diff --git a/CMakeLists.txt b/CMakeLists.txt index 433081ee2256b..f3ed08d56e6d6 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module +include(experimental) # experimental build options if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 69e66407580b6..92a526a2b58a7 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -52,6 +52,7 @@ if(NOT DEFINED CBLAS_PROVIDER) set(OPENBLAS_INCLUDE_SEARCH_PATHS ${OPENBLAS_ROOT}/include /usr/include + /usr/include/lapacke /usr/include/openblas /usr/local/opt/openblas/include) set(OPENBLAS_LIB_SEARCH_PATHS @@ -65,15 +66,17 @@ if(NOT DEFINED CBLAS_PROVIDER) PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + find_path(OPENBLAS_CONFIG_INC_DIR NAMES openblas_config.h + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) find_library(OPENBLAS_LIB NAMES openblas PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) - if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file) + if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_CONFIG_INC_DIR AND OPENBLAS_LIB) + file(READ "${OPENBLAS_CONFIG_INC_DIR}/openblas_config.h" config_file) string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) - if (${ver} VERSION_GREATER_EQUAL "0.3.7") + if (${ver} VERSION_GREATER_EQUAL "0.3.5") set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) @@ -138,4 +141,3 @@ if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) target_link_libraries(cblas ${CBLAS_LIBRARIES}) endif() - diff --git a/cmake/experimental.cmake b/cmake/experimental.cmake new file mode 100644 index 0000000000000..55e7fe263f9dc --- /dev/null +++ b/cmake/experimental.cmake @@ -0,0 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this file contains experimental build options + +include(experiments/cuda_module_loading_lazy) diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake new file mode 100644 index 0000000000000..ef6a51b594b9e --- /dev/null +++ b/cmake/experiments/cuda_module_loading_lazy.cmake @@ -0,0 +1,40 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this file contains experimental build options for lazy cuda module loading +# cuda module lazy loading is supported by CUDA 11.6+ +# this experimental option makes Paddle support lazy loading before CUDA 11.6. + +option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF) +if (${EXP_CUDA_MODULE_LOADING_LAZY}) + if (NOT ${ON_INFER} OR NOT ${LINUX}) + message("EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms") + return() + endif () + if (NOT ${CUDA_FOUND}) + message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA") + return() + endif () + if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6") + message("cuda 11.6+ already supports lazy module loading") + return() + endif () + + message("for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a") + set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE) + set(CMAKE_CUDA_FLAGS "--cudart shared") + enable_language(CUDA) + set(CUDA_NVCC_EXECUTABLE "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) + set(CMAKE_CUDA_COMPILER "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) +endif() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 1c4a424995887..43d5002fe3819 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220511") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220520") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220511") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220520") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index f9cac0579fec4..0dbd3bc328314 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -142,12 +142,10 @@ set(COMMON_FLAGS -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs - -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 -Wimplicit-fallthrough=0 # Warning in tinyformat.h - -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ${fsanitize} ) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index fca395c5f2bf7..52e09792d5d80 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -113,6 +113,19 @@ class ProcessGroup { "ProcessGroup%s does not support receive", GetBackendName())); } + virtual std::shared_ptr Send_Partial(phi::DenseTensor&, 
int, int, + int) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send", GetBackendName())); + } + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, int, int, int) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support receive", GetBackendName())); + } + virtual std::shared_ptr AllGather( std::vector&, // NOLINT std::vector&) { // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 86cc5b5db7cd7..f1b66864b2930 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -428,6 +428,53 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::Send_Partial( + phi::DenseTensor& tensors, int dst_rank, int offset, int length) { + // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); + + std::vector shared_tensors; + shared_tensors.push_back(shared_input); + + auto task = PointToPoint(shared_tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, + const gpuStream_t& stream, int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); + return task; +} + +std::shared_ptr ProcessGroupNCCL::Recv_Partial( + phi::DenseTensor& tensors, int src_rank, int offset, int length) { + // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); + + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); + + std::vector shared_tensors; + shared_tensors.push_back(shared_input); + + auto task = PointToPoint(shared_tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, + const gpuStream_t& stream, int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, comm, stream); + }, + src_rank, CommType::RECV); + return task; +} + std::shared_ptr ProcessGroupNCCL::AllGather( std::vector& in_tensors, std::vector& out_tensors) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 4b6c3f4031354..82ced6e135ac9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -102,6 +102,14 @@ class ProcessGroupNCCL : public ProcessGroup { std::shared_ptr Recv( std::vector& tensors, int src_rank) override; + std::shared_ptr Send_Partial(phi::DenseTensor& tensors, + int dst_rank, int offset, + int length) override; + + std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, + int src_rank, int offset, + int length) override; + std::shared_ptr AllGather( std::vector& in_tensors, std::vector& out_tensors) override; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index cacd55e02a5e2..d8f937e218be4 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -546,9 +546,9 @@ bool DistModel::Run(const 
std::vector &input_data, DistModelTimer timer; timer.tic(); - double feed_elapse; - double fleet_exe_elapse; - double fetch_elapse; + double feed_elapse = 0; + double fleet_exe_elapse = 0; + double fetch_elapse = 0; if (!FeedData(input_data, scope_.get())) { LOG(ERROR) << "DistModel failed at feeding data."; diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 2ed44ce489934..544e7c8fe85d6 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -28,33 +28,40 @@ namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, - const paddle::experimental::Tensor& t) { - if (!tensor->defined() || !tensor->initialized()) { - // Simply copy tensor->impl + const paddle::experimental::Tensor& t, + bool is_fake_empty) { + if (is_fake_empty) { *tensor = t; } else { - // Accumulation - if (LIKELY(t.is_dense_tensor())) { - if (LIKELY(tensor->is_dense_tensor())) { - paddle::imperative::TensorAdd(t, tensor); + if (!tensor->defined() || !tensor->initialized()) { + // Simply copy tensor->impl + *tensor = t; + } else { + // Accumulation + if (LIKELY(t.is_dense_tensor())) { + if (LIKELY(tensor->is_dense_tensor())) { + paddle::imperative::TensorAdd(t, + tensor); + } else { + // TODO(jiabin): Support Other TensorBase later + // TODO(zhanlve): Replace SelectedRowsAddTensor with + // add_dygraph_function once it's supported + paddle::experimental::Tensor new_buffer( + std::make_shared(), "tmp_accumulator"); + paddle::imperative::SelectedRowsAddTensor(*tensor, t, &new_buffer); + tensor->set_impl(new_buffer.impl()); + } } else { // TODO(jiabin): Support Other TensorBase later // TODO(zhanlve): Replace SelectedRowsAddTensor with - // add_dygraph_function once it's supported - paddle::experimental::Tensor new_buffer( - std::make_shared(), "tmp_accumulator"); - paddle::imperative::SelectedRowsAddTensor(*tensor, t, &new_buffer); - tensor->set_impl(new_buffer.impl()); - } - } else { - // TODO(jiabin): Support Other TensorBase later - // TODO(zhanlve): Replace SelectedRowsAddTensor with add_dygraph_function - // once it's supported - if (tensor->is_dense_tensor()) { - paddle::imperative::SelectedRowsAddToTensor(t, tensor); - } else { - *tensor = std::move(*paddle::imperative::SelectedRowsMerge< - paddle::experimental::Tensor>(t, *tensor)); + // add_dygraph_function + // once it's supported + if (tensor->is_dense_tensor()) { + paddle::imperative::SelectedRowsAddToTensor(t, tensor); + } else { + *tensor = std::move(*paddle::imperative::SelectedRowsMerge< + paddle::experimental::Tensor>(t, *tensor)); + } } } } @@ -91,7 +98,8 @@ GradNodeAccumulation::operator()( if (!weak_grad_.expired() && !is_new_grad) { auto grad = weak_grad_.lock(); - CopyOrAddTensor(grad.get(), grad_out); + CopyOrAddTensor(grad.get(), grad_out, is_fake_empty_); + is_fake_empty_ = false; } // Apply Reduce Hooks diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index f37de9c8e88f1..6374534578cb8 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -64,14 +64,16 @@ class GradNodeAccumulation : public GradNodeBase { new GradNodeAccumulation(nullptr)); } + void SetFakeEmpty(bool is_fake_empty) { is_fake_empty_ = is_fake_empty; } + private: + // TODO(Jiabin): remove this when we make our clear gradient really cleared; + bool is_fake_empty_ = {false}; 
std::weak_ptr weak_grad_; - + std::vector> reduce_hooks_; std::function retain_grad_hook_; - - std::vector> reduce_hooks_; }; } // namespace egr diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 3edd13ccd597f..817a0de6e0ca9 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -231,6 +231,15 @@ class GradNodeGenerationInfo { return &no_need_buffer_ins_; } + const std::unordered_map& GetBackwardInplaceMap() + const { + return backward_inplace_map_; + } + std::unordered_map* + GetMutableBackwardInplaceMap() { + return &backward_inplace_map_; + } + private: std::string op_base_type_; std::map grad_outs_slotname_map_; @@ -244,6 +253,7 @@ class GradNodeGenerationInfo { grad_outs_; paddle::framework::AttributeMap grad_attrs_; std::unordered_set no_need_buffer_ins_; + std::unordered_map backward_inplace_map_; }; public: @@ -979,6 +989,12 @@ static bool CollectGradInformationFromOpInfo( *(*op_base_infos)[index].GetMutableNoNeedBufferInputs() = inferer(g_ins, g_outs, *op_base_grad_attrs); } + + auto& infer_backward_inplace = op_base.Info().infer_inplace_; + if (infer_backward_inplace) { + *(*op_base_infos)[index].GetMutableBackwardInplaceMap() = + infer_backward_inplace(true); + } } /* ------ Slot Name Matching ---- */ @@ -1005,7 +1021,7 @@ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, const GradNodeGenerationInfo& bwd_info, const std::string& trace_op_body_str, - std::map inplace_map = {}) { + std::map forward_inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -1045,8 +1061,10 @@ static std::string GenerateGradNodeCreationContent( } else { // In inplace op, the case where output is duplicable is not considered. // Replace output directly with input in inplace op. - if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = LegalizeVarName(inplace_map[output_name]); + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { + auto inplace_input_name = + LegalizeVarName(forward_inplace_map[output_name]); const std::string& inplace_input_autograd_name = "p_autograd_" + inplace_input_name; const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = @@ -1103,12 +1121,12 @@ static std::string GenerateGradNodeCreationContent( // check inplace input to avoid inplace operations on leaf nodes with // stop_gradient=False. std::string check_inplace_str = ""; - if (!inplace_map.empty()) { + if (!forward_inplace_map.empty()) { const char* CHECKING_INPLACE_TEMPLATE = " // Check Inplace\n" " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " "require_any_grad);\n"; - for (auto& inplace_pair : inplace_map) { + for (auto& inplace_pair : forward_inplace_map) { std::string inplace_name = LegalizeVarName(inplace_pair.second); check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name); @@ -1161,8 +1179,9 @@ static std::string GenerateGradNodeCreationContent( const char* SET_TENSOR_WRAPPER_TEMPLATE = " grad_node->SetTensorWrapper%s(%s);\n"; // Replace output directly with input in inplace op. 
- if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { - auto inplace_input_name = inplace_map[tensor_wrapper_name]; + if (!forward_inplace_map.empty() && + forward_inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = forward_inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), LegalizeVarName(inplace_input_name)); @@ -1213,8 +1232,9 @@ static std::string GenerateGradNodeCreationContent( for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); // Replace output directly with input in inplace op. - if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = inplace_map[output_name]; + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { + auto inplace_input_name = forward_inplace_map[output_name]; const std::string& inplace_input_autograd_name = "p_autograd_" + LegalizeVarName(inplace_input_name); size_t output_position = fwd_outputs_name_pos_map.at(output_name); @@ -1345,7 +1365,7 @@ static std::string GenerateGradNodeCreationContent( static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, const GradNodeGenerationInfo& bwd_info, - std::map inplace_map = {}) { + std::map forward_inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1434,8 +1454,8 @@ static std::pair GenerateForwardFunctionContents( // inplace tensor can't be const const char* FWD_INS_ARG_TEMPLATE; bool flag_find_input_name = false; - if (!inplace_map.empty()) { - for (auto& inplace_pair : inplace_map) { + if (!forward_inplace_map.empty()) { + for (auto& inplace_pair : forward_inplace_map) { if (inplace_pair.second == input_name) { flag_find_input_name = true; FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; @@ -1605,15 +1625,16 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_name); - } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + } else if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // In inplace op, replace the output with the input directly. 
PADDLE_ENFORCE_NE( - inplace_map[output_name], "", + forward_inplace_map[output_name], "", paddle::platform::errors::InvalidArgument( "Inplace op %s has no input corresponding to output %s.", op_type, output_name)); const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; - auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = forward_inplace_map[output_name]; outs_contents_str += paddle::string::Sprintf( FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); @@ -1651,7 +1672,7 @@ static std::pair GenerateForwardFunctionContents( if (inplace_mapping_str.size() > 0) inplace_mapping_str.pop_back(); // Remove trailing "," - if ((op_type != "cast") && (inplace_map.empty())) { + if ((op_type != "cast") && (forward_inplace_map.empty())) { VLOG(6) << "Generating Dygraph Forward AMP"; const char* AMP_LOGIC_CONTEXT = " if (egr::Controller::Instance().GetAMPLevel() != " @@ -1743,7 +1764,7 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Generated Outs Map"; // [Generation] Apply View Strategy (Tensor) - if (inplace_map.empty() && view_op_map.count(op_type)) { + if (forward_inplace_map.empty() && view_op_map.count(op_type)) { const char* HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT = " if (ins.count(\"%s\") && outs.count(\"%s\")) {\n" " egr::EagerUtils::HandleViewBetweenInputAndOutput(ins[\"%s\"][0], " @@ -1852,10 +1873,11 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - if (!inplace_map.empty() && inplace_map.count(output_name)) { + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // Modify meta info of inplace tensor. // Bump inplace version of inplace tensor. - auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = forward_inplace_map[output_name]; const char* FWD_OUT_TENSOR_TEMPLATE = " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n" " %s.bump_inplace_version();\n" @@ -1878,10 +1900,11 @@ static std::pair GenerateForwardFunctionContents( return_types[return_position] = "paddle::experimental::Tensor"; } - if (!inplace_map.empty() && inplace_map.count(output_name)) { + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // Replace output directly with input in inplace op. return_contents[return_position] = - LegalizeVarName(inplace_map[output_name]); + LegalizeVarName(forward_inplace_map[output_name]); } else { return_contents[return_position] = output_varname; } @@ -1903,7 +1926,7 @@ static std::pair GenerateForwardFunctionContents( // If GradNode needs to be generated, pass `trace_op_body_str` // into `GenerateGradNodeCreationContent`. std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - fwd_info, bwd_info, trace_op_body_str, inplace_map); + fwd_info, bwd_info, trace_op_body_str, forward_inplace_map); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; @@ -1960,7 +1983,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Full Function std::string function_name; - if (inplace_map.empty()) { + if (forward_inplace_map.empty()) { function_name = op_type + "_dygraph_function"; } else { // change function_name for inplace op. 
@@ -2013,12 +2036,39 @@ static std::string GenerateSingleOpBase( std::vector>>& grad_outs, const paddle::framework::AttributeMap& grad_attrs, + const std::unordered_map& backward_inplace_map, bool is_op_base_per_duplicable_input, size_t* outs_size) { std::string generated_grad_function_body = ""; const std::string& ins_name = "ins" + std::to_string(*outs_size); const std::string& outs_name = "outs" + std::to_string(*outs_size); const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); + const std::string& hooked_grads = "hooked_grads" + std::to_string(*outs_size); + + // [Generation] Get Full Zero + std::string fill_zero_str = ""; + if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; + if (grad_ins_grad_slotname_map.count(grad_input_name)) { + size_t fwd_output_position = fwd_outputs_name_pos_map.at( + grad_ins_grad_slotname_map.at(grad_input_name)); + const char* FILL_ZERO_TEMPLATE = + "egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[%d], " + "this->InputMeta()[%d]);\n"; + fill_zero_str += paddle::string::Sprintf( + FILL_ZERO_TEMPLATE, fwd_output_position, fwd_output_position); + } + } + } + generated_grad_function_body += fill_zero_str; + generated_grad_function_body += + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> " + + hooked_grads + + " = " + "GradNode" + + fwd_op_type + "::ApplyGradientHooks(grads);\n"; // [Generation] Get Ins Map std::unordered_set dispensable_input_name_set; @@ -2029,6 +2079,23 @@ static std::string GenerateSingleOpBase( for (const auto& in : in_vars) { if (in.duplicable()) duplicable_input_name_set.insert(in.name()); } + const char* CHECK_BACKWARD_INPLACE_TEMPLATE = + " // Check backward inplace info\n" + " bool %s = false;\n" + " %s\n" + " if (%s.initialized()) {\n" + " VLOG(10) << %s.name() << \"(%s) use_count: \" << " + "%s.impl().use_count();\n" + " if (%s.impl().use_count() == 1 || (%s.impl().use_count() == 2 && " + "%s.impl().get() == %s.impl().get())) {\n" + " %s = true;\n" + " }\n" + " }\n"; + const std::string& can_be_inplaced_name = + "can_be_inplaced" + std::to_string(*outs_size); + const std::string& bwd_inplace_input_name = + "backward_inplace_tensor" + std::to_string(*outs_size); + bool process_backward_inplace = false; std::string ins_contents_str = ""; for (auto iter : grad_ins) { const std::string& grad_input_name = iter.first; @@ -2051,16 +2118,52 @@ static std::string GenerateSingleOpBase( ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, struct_fwd_input_name); - + if (!backward_inplace_map.empty() && + backward_inplace_map.count(grad_input_name)) { + process_backward_inplace = true; + const char* GRAD_INS_FWD_TENSOR_WRAPPER_TEMPLATE = + "auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);"; + std::string tensor_wrapper_str = paddle::string::Sprintf( + GRAD_INS_FWD_TENSOR_WRAPPER_TEMPLATE, bwd_inplace_input_name, + struct_fwd_input_name); + const char* GRAD_INS_FWD_TENSOR_TEMPLATE = + "(&this->%s)->get_intermidiate_tensor()"; + std::string tensor_wrapper_intermidiate_tensor_str = + paddle::string::Sprintf(GRAD_INS_FWD_TENSOR_TEMPLATE, + struct_fwd_input_name); + generated_grad_function_body += paddle::string::Sprintf( + CHECK_BACKWARD_INPLACE_TEMPLATE, can_be_inplaced_name, + tensor_wrapper_str, bwd_inplace_input_name, bwd_inplace_input_name, + grad_input_name, bwd_inplace_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, bwd_inplace_input_name, + 
tensor_wrapper_intermidiate_tensor_str, can_be_inplaced_name); + } } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { // Fwd Tensor's Grad size_t fwd_output_position = fwd_outputs_name_pos_map.at( grad_ins_grad_slotname_map.at(grad_input_name)); const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },"; + "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s[%d]) },"; ins_contents_str += paddle::string::Sprintf( - GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position); - + GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, hooked_grads, + fwd_output_position); + if (!backward_inplace_map.empty() && + backward_inplace_map.count(grad_input_name)) { + process_backward_inplace = true; + const char* GRAD_INS_HOOKED_GRAD_TEMPLATE = "auto& %s = %s[%d][0];"; + std::string hooked_grads_tensor_str = paddle::string::Sprintf( + GRAD_INS_HOOKED_GRAD_TEMPLATE, bwd_inplace_input_name, hooked_grads, + fwd_output_position); + const char* GRAD_INS_GRAD_TENSOR_TEMPLATE = "grads[%d][0]"; + std::string grads_tensor_str = paddle::string::Sprintf( + GRAD_INS_GRAD_TENSOR_TEMPLATE, fwd_output_position); + generated_grad_function_body += paddle::string::Sprintf( + CHECK_BACKWARD_INPLACE_TEMPLATE, can_be_inplaced_name, + hooked_grads_tensor_str, bwd_inplace_input_name, + bwd_inplace_input_name, grad_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, grads_tensor_str, can_be_inplaced_name); + } } else { PADDLE_THROW(platform::errors::Fatal( "Detected mismatched slot names." @@ -2112,7 +2215,6 @@ static std::string GenerateSingleOpBase( } VLOG(6) << "Generated Ins Map"; - // [Generation] Get Outs Map std::string outs_contents_str = ""; for (auto iter : grad_outs) { @@ -2161,9 +2263,12 @@ static std::string GenerateSingleOpBase( size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },"; + " if((!out_metas[%d].empty()) && " + "(!(out_metas[%d][0].IsStopGradient()))){ \n %s.insert({ \"%s\", " + "egr::EagerUtils::TrySyncToVars(%s[%d])});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position); + GRAD_OUTS_CONTENT_TEMPLATE, grads_position, grads_position, + outs_name, grad_output_name, hooked_grads, grads_position); } else { if (dispensable_input_name_set.count(fwd_name) && @@ -2174,18 +2279,20 @@ static std::string GenerateSingleOpBase( if (duplicable_input_name_set.count(fwd_name) && !is_op_base_per_duplicable_input) { const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].size() ) },"; + " if(!out_metas[%d].empty()){ %s.insert({ \"%s\", " + "egr::EagerUtils::CreateVars(out_metas[%d].size())});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + GRAD_OUTS_CONTENT_TEMPLATE, fwd_input_position, outs_name, + grad_output_name, fwd_input_position); } else { const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " + " if((!out_metas[%d].empty()) && " + "(!(out_metas[%d][0].IsStopGradient()))){ %s.insert({ \"%s\", " "{std::make_shared(egr::Controller::Instance(" - ")." 
- "GenerateUniqueName())}},"; + ").GenerateUniqueName())}});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + GRAD_OUTS_CONTENT_TEMPLATE, fwd_input_position, + fwd_input_position, outs_name, grad_output_name); } } } else { @@ -2195,16 +2302,15 @@ static std::string GenerateSingleOpBase( grad_output_name)); } } - if (outs_contents_str.size() > 0) - outs_contents_str.pop_back(); // // Remove trailing "," const char* BWD_OUTS_MAP_TEMPLATE = " std::map>> %s = { " - "%s };\n"; - std::string outs_map_str = paddle::string::Sprintf( - BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); + "std::vector>> %s;\n"; + std::string outs_map_str = + paddle::string::Sprintf(BWD_OUTS_MAP_TEMPLATE, outs_name); + generated_grad_function_body += outs_map_str; + generated_grad_function_body += outs_contents_str; generated_grad_function_body += "\n"; for (auto iter : grad_outs) { const std::string& grad_output_name = iter.first; @@ -2219,18 +2325,23 @@ static std::string GenerateSingleOpBase( !is_op_base_per_duplicable_input) { size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = - " if(%s.size() > 0) %s[\"%s\"] = egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].size() );\n"; + " if((%s.size() > 0) && (!out_metas[%d].empty()) && " + "(!out_metas[%d][0].IsStopGradient())) %s[\"%s\"] = " + "egr::EagerUtils::CreateVars( " + "out_metas[%d].size() );\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, grad_output_name, fwd_input_position); } else { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = - " if(%s.defined()) %s[\"%s\"] = " + " if(%s.defined() && (!out_metas[%d].empty()) && " + "(!out_metas[%d][0].IsStopGradient())) %s[\"%s\"] = " "{std::make_shared(egr::Controller::" "Instance().GenerateUniqueName())};\n"; generated_grad_function_body += paddle::string::Sprintf( - DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, + DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, + fwd_input_position, fwd_input_position, outs_name, grad_output_name); } } @@ -2245,6 +2356,27 @@ static std::string GenerateSingleOpBase( VLOG(6) << "Generated Outs Map"; + // [Generation] Process Backward Inplace + if (process_backward_inplace) { + const char* HANDLE_BACKWARD_INPLACE_BETWEEN_INPUT_AND_OUTPUT = + " if (%s && %s.count(\"%s\") && %s.count(\"%s\")) {\n" + " egr::EagerUtils::HandleViewBetweenInputAndOutput(%s[\"%s\"][0], " + "%s[\"%s\"][0]);\n" + " };\n"; + std::string backward_inplace_map_str = ""; + for (auto iter : backward_inplace_map) { + std::string backward_inplace_input_name = iter.first; + std::string backward_inplace_output_name = iter.second; + backward_inplace_map_str += paddle::string::Sprintf( + HANDLE_BACKWARD_INPLACE_BETWEEN_INPUT_AND_OUTPUT, + can_be_inplaced_name, ins_name, backward_inplace_input_name, + outs_name, backward_inplace_output_name, ins_name, + backward_inplace_input_name, outs_name, backward_inplace_output_name); + } + generated_grad_function_body += backward_inplace_map_str; + VLOG(6) << "Process Backward Inplace"; + } + // [Generation] Get Attrs Map const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = @@ -2289,16 +2421,20 @@ static std::string GenerateSingleOpBase( size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); if 
(!is_op_base_per_duplicable_input) { const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + " if (%s.find(\"%s\") != %s.end()) { outputs[%d] = " + "egr::EagerUtils::GetOutputs(%s[\"%s\"]); }\n"; outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); + BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name, outs_name, + fwd_input_position, outs_name, grad_out_name); } else { const char* BWD_OUTPUT_TEMPLATE = " " + "if (%s.find(\"%s\") != %s.end()) { " "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]" - ");\n"; + "); }\n"; outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name, - grad_out_name); + grad_out_name, outs_name, + outs_name, grad_out_name); } num_appended_outputs++; } else { @@ -2317,9 +2453,11 @@ static std::string GenerateSingleOpBase( if (fwd_outputs_name_pos_map.count(fwd_name)) { const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + " if (%s.find(\"%s\") != %s.end()) { outputs[%d] = " + "egr::EagerUtils::GetOutputs(%s[\"%s\"]); }\n"; outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, num_appended_outputs, outs_name, grad_out_name); + BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name, outs_name, + num_appended_outputs, outs_name, grad_out_name); num_appended_outputs++; } } @@ -2428,13 +2566,15 @@ static std::string GenerateGradNodeCCContents( const auto& grad_ins = op_base_info.GetGradIns(); const auto& grad_outs = op_base_info.GetGradOuts(); const auto& grad_attrs = op_base_info.GetGradAttrs(); + const auto& backward_inplace_map = op_base_info.GetBackwardInplaceMap(); const std::string& op_base_type = op_base_info.GetOpBaseType(); generated_grad_function_body += GenerateSingleOpBase( fwd_op_type, op_base_type, fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, - grad_attrs, is_op_base_per_duplicable_input, &outs_size); + grad_attrs, backward_inplace_map, is_op_base_per_duplicable_input, + &outs_size); } if (is_op_base_per_duplicable_input) { @@ -2447,18 +2587,15 @@ static std::string GenerateGradNodeCCContents( } const char* BWD_RETURN_TEMPLATE = - " paddle::small_vector, " - "egr::kSlotSmallVectorSize> hooked_grads = " - "GradNode%s::ApplyGradientHooks(grads);\n" + " const auto& out_metas = OutputMeta();\n" " paddle::small_vector, " "egr::kSlotSmallVectorSize> outputs(%d);\n" " %s\n" " if(NeedComplexToRealConversion()) " "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; - generated_grad_function_body = - paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), - generated_grad_function_body); + generated_grad_function_body = paddle::string::Sprintf( + BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = @@ -2469,17 +2606,9 @@ static std::string GenerateGradNodeCCContents( "egr::kSlotSmallVectorSize>& grads, bool " "create_graph, bool is_new_grad) {\n" "%s" - "%s" "\n}"; - std::string fill_zero_str = ""; - if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { - fill_zero_str = - "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, " - "this->InputMeta());\n"; - } - std::string grad_function_str = - paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type, - fill_zero_str, generated_grad_function_body); + std::string grad_function_str = paddle::string::Sprintf( + 
GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); VLOG(6) << "Generated returns"; @@ -2847,19 +2976,20 @@ static void DygraphCodeGeneration(const std::string& output_dir) { auto& infer_inplace = paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; - std::map inplace_map; + std::map forward_inplace_map; // Inplace Function Generator. // `sum` op has duplicate input. Don't consider adding inplace strategy // for `sum` in temporary. if (infer_inplace && !special_inplace_op_set.count(op_type)) { auto in_to_outs = infer_inplace(true); for (auto& inplace_pair : in_to_outs) { - inplace_map[inplace_pair.second] = inplace_pair.first; + forward_inplace_map[inplace_pair.second] = inplace_pair.first; } VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------"; std::pair inplace_body_and_declaration = - GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map); + GenerateForwardFunctionContents(fwd_info, bwd_info, + forward_inplace_map); fwd_function_str += inplace_body_and_declaration.first + "\n"; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 3f4fcc4608eeb..57681be58ae47 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -28,7 +28,10 @@ "multiply_triple_grad", "conv2d_grad_grad", "batch_norm_double_grad", "tanh_double_grad", "tanh_triple_grad", "subtract_double_grad", "divide_double_grad", "log_double_grad", "elu_double_grad", - "leaky_relu_double_grad" + "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad", + "square_double_grad", "celu_double_grad", "pad_double_grad", + "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad", + "conv3d_double_grad", "depthwise_conv2d_grad_grad" ]) # For API dispatch used at python-level @@ -173,7 +176,10 @@ def RecoverBaseNameOfInplaceFunction(function_name): def GetInplacedFunctionName(function_name): - return function_name + "_" + inplace_func_name = function_name + if inplace_func_name[-1] != '_': + inplace_func_name += '_' + return inplace_func_name def GetForwardFunctionName(string): @@ -307,6 +313,23 @@ def ParseYamlBackward(args_str, returns_str): return inputs_list, attrs_list, returns_list +def ParseYamlInplaceInfo(string): + # inplace_map_str: "(x -> out0), (y -> out2)" + inplace_map = {} + for pair in string.split(","): + pair = pair.strip() + if pair.startswith("("): + pair = pair[1:] + + if pair.endswith(")"): + pair = pair[:-1] + + key = pair.split("->")[0].strip() + val = pair.split("->")[1].strip() + inplace_map[key] = val + return inplace_map + + ######################## ### Generator Base ### ######################## @@ -334,25 +357,14 @@ def __init__(self, forward_api_contents, namespace): self.optional_inputs = [] #[name, ...] self.no_need_buffers = [] #[name, ...] self.intermediate_outputs = [] #[name, ...] 
- self.inplace_map = {} #{name : name, ...} + self.forward_inplace_map = {} #{name : name, ...} - def ParseInplaceInfo(self): + def ParseForwardInplaceInfo(self): forward_api_contents = self.forward_api_contents if 'inplace' not in forward_api_contents.keys(): return - # inplace_map_str: "(x -> out0), (y -> out2)" inplace_map_str = forward_api_contents['inplace'] - for pair in inplace_map_str.split(","): - pair = pair.strip() - if pair.startswith("("): - pair = pair[1:] - - if pair.endswith(")"): - pair = pair[:-1] - - key = pair.split("->")[0].strip() - val = pair.split("->")[1].strip() - self.inplace_map[key] = val + self.forward_inplace_map = ParseYamlInplaceInfo(inplace_map_str) def ParseNoNeedBuffer(self): grad_api_contents = self.grad_api_contents @@ -436,7 +448,7 @@ def ParseForwardYamlContents(self): def InferNameSpace(self): api_yaml_path = self.api_yaml_path - if "sparse" in api_yaml_path: + if re.search(r"sparse[a-zA-Z0-9_]*\.yaml", api_yaml_path): self.namespace = "sparse::" - elif "strings" in api_yaml_path: + elif re.search(r"strings[a-zA-Z0-9_]*\.yaml", api_yaml_path): self.namespace = "strings::" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 1ce5216ddce9d..d8b909c3bacc1 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -29,10 +29,18 @@ from codegen_utils import GetInplacedFunctionName from codegen_utils import ParseYamlArgs, ParseYamlReturns, ParseYamlForwardFromBackward from codegen_utils import ParseYamlForward, ParseYamlBackward +from codegen_utils import ParseYamlInplaceInfo from codegen_utils import FunctionGeneratorBase, GeneratorBase from codegen_utils import ops_to_fill_zero_for_empty_grads from codegen_utils import AssertMessage, GetIndent +# Note: assign is an inplace api when parameter(output) isn't none, +# so we should check parameter(output) with rule of inplace. +# But because there is no check in old dygraph mode, in order to +# keep the code compatible, here we also skip inplace check in new dygraph temporarily, +# and this will be fixed in the future. 
+inplace_check_blacklist = set(["assign_out_"]) + ########### ## Utils ## @@ -337,15 +345,25 @@ class {} : public egr::GradNodeBase {{ CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE = \ """ - paddle::optional {}_optional = paddle::none; - if({}.initialized()) {}_optional = paddle::make_optional({}); + paddle::optional {}_optional; + if({}.initialized()) {}_optional = paddle::make_optional({}); """ CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE = \ """ - paddle::optional {}_optional = paddle::none; - if( {}.impl() ) {}_optional = paddle::make_optional({}); + paddle::optional {}_optional; + if( {}.impl() ) {}_optional = paddle::make_optional({}); +""" + +CHECK_BACKWARD_INPLACE_TEMPLATE = \ """ + bool can_be_inplaced = false; + if ({}.initialized()) {{ + VLOG(10) << {}.name() << "({}) use_count: " << {}.impl().use_count(); + if ({}.impl().use_count() == 1 || ({}.impl().use_count() == 2 && {}.impl().get() == {}.impl().get())) {{ + can_be_inplaced = true; + }} + }}""" CHECK_NAN_AND_INF_TEMPLATE = \ """ if (FLAGS_check_nan_inf) {{ egr::CheckTensorHasNanOrInf("{}", {}); }} @@ -407,7 +425,7 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): #self.optional_inputs #self.no_need_buffers #self.intermediate_outputs - #self.inplace_map + #self.forward_inplace_map FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) self.grad_api_contents = grad_api_contents @@ -438,6 +456,15 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): self.backward_grad_outputs_map = { } #{ "name" : [type, fwd_position, orig_position] ...} + self.backward_inplace_map = {} #{name : name, ...} + + def ParseBackwardInplaceInfo(self): + grad_api_contents = self.grad_api_contents + if 'inplace' not in grad_api_contents.keys(): return + + inplace_map_str = grad_api_contents['inplace'] + self.backward_inplace_map = ParseYamlInplaceInfo(inplace_map_str) + def DygraphYamlValidationCheck(self): forward_api_contents = self.forward_api_contents grad_api_contents = self.grad_api_contents @@ -686,7 +713,7 @@ def GenerateNodeCreationCodes(self): if is_fwd_input: if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" + set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name});" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_input_tensor_wrappers_list.append(set_tensor_wrappers) @@ -697,7 +724,7 @@ def GenerateNodeCreationCodes(self): ), AssertMessage(name, forward_outputs_position_map.keys()) if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" + set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name});" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_output_tensor_wrappers_list.append(set_tensor_wrappers) @@ -777,8 +804,9 @@ def run(self): ########################## ## Parsing Raw Contents ## ########################## - # Parse inplace_map - self.ParseInplaceInfo() + # Parse forward and backward inplace_map + self.ParseForwardInplaceInfo() + self.ParseBackwardInplaceInfo() # Parse no_need_buffer self.ParseNoNeedBuffer() @@ -827,17 +855,19 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): namespace = self.namespace - + if self.forward_api_name[-1] == '_' and not is_inplaced: + return forward_api_name = 
GetInplacedFunctionName( self.forward_api_name) if is_inplaced else self.forward_api_name forward_inputs_position_map = self.forward_inputs_position_map forward_outputs_position_map = self.forward_outputs_position_map forward_attrs_list = self.forward_attrs_list + backward_grad_outputs_map = self.backward_grad_outputs_map optional_inputs = self.optional_inputs intermediate_outputs = self.intermediate_outputs - inplace_map = self.inplace_map if is_inplaced else {} + forward_inplace_map = self.forward_inplace_map if is_inplaced else {} indent = GetIndent(1) # Get Function Args @@ -858,18 +888,15 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): is_optional = (name in optional_inputs) if IsPlainTensorType(ttype): if is_optional: - arg_str = f"const paddle::optional {name}" + arg_str = f"const paddle::optional& {name}" amp_tensors_vector_optional_list.append( - f"if ({name}.get_ptr() != nullptr) amp_tensors_vector.push_back({{ *({name}.get_ptr()) }});\n" - ) - amp_autocast_optional_list.append( - f"auto NEW_{name}_temp_tensor = ({name}.get_ptr() != nullptr) ? egr::EagerAmpAutoCast(\"{name}\", *({name}.get_ptr()), amp_dst_dtype, op_name) : paddle::experimental::Tensor();\n" + f"if ({name}) amp_tensors_vector.push_back({{ *{name} }});\n" ) amp_autocast_optional_list.append( - f"auto NEW_{name} = ({name}.get_ptr() != nullptr) ? paddle::make_optional(NEW_{name}_temp_tensor) : {name};\n" + f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) else: - if is_inplaced and inplace_map and name in inplace_map.keys( + if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( ): arg_str = f"paddle::experimental::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") @@ -944,13 +971,15 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): returns_list[pos] = f"{name}" if IsPlainTensorType(rtype): - if is_inplaced and inplace_map and name in inplace_map.values(): + if is_inplaced and forward_inplace_map and name in forward_inplace_map.values( + ): returns_type_list[pos] = "paddle::experimental::Tensor&" else: returns_type_list[pos] = "paddle::experimental::Tensor" else: assert IsVectorTensorType(rtype) - if is_inplaced and inplace_map and name in inplace_map.values(): + if is_inplaced and forward_inplace_map and name in forward_inplace_map.values( + ): returns_type_list[ pos] = "std::vector&" else: @@ -971,17 +1000,26 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): inputs_autograd_meta_list = [] compute_require_grad_args_list = ["trace_backward"] for name, (ttype, pos) in forward_inputs_position_map.items(): - input_autograd_meta_name = GetAutoGradMetaName(name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) + # Has corresponding grad output + has_corresponding_grad_output = False + for _, (_, corresponding_pos, + _) in backward_grad_outputs_map.items(): + if pos == corresponding_pos: + has_corresponding_grad_output = 
True + if has_corresponding_grad_output or ( + name in forward_inplace_map and + forward_api_name not in inplace_check_blacklist): + input_autograd_meta_name = GetAutoGradMetaName(name) + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append(input_autograd_meta_name) inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) compute_require_grad_args_str = ",".join(compute_require_grad_args_list) @@ -1014,10 +1052,12 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): check_inplace_str = "" bump_inplace_version_str = "" if is_inplaced: - for inplace_name in inplace_map.keys(): - inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name) - check_inplace_str += CHECK_INPLACE_TEMPLATE.format( - inplace_name, inplace_autograd_meta_name) + for inplace_name in forward_inplace_map.keys(): + if forward_api_name not in inplace_check_blacklist: + inplace_autograd_meta_name = GetAutoGradMetaName( + inplace_name) + check_inplace_str += CHECK_INPLACE_TEMPLATE.format( + inplace_name, inplace_autograd_meta_name) bump_inplace_version_str += BUMP_INPLACE_VERSION_TEMPLATE.format( inplace_name, inplace_name) @@ -1258,6 +1298,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, backward_grad_inputs_map = self.backward_grad_inputs_map backward_grad_outputs_map = self.backward_grad_outputs_map backward_attrs_list = self.backward_attrs_list + backward_inplace_map = self.backward_inplace_map indent = GetIndent(1) # Construct grad_api function args @@ -1282,6 +1323,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, else: fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyGradInput(&grads[{fwd_position}], input_metas[{fwd_position}]);\n" + inplace_grad_input_str = "" # Grad Ins from TensorWrappers for name, (_, is_fwd_input, grad_api_position), in backward_forward_inputs_map.items(): @@ -1290,6 +1332,14 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, is_optional = (name in self.optional_inputs) tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" + if backward_inplace_map and name in backward_inplace_map.keys(): + tensor_wrapper_intermidiate_tensor_str = f"(&this->{tensor_wrapper_name})->get_intermidiate_tensor()" + tensor_wrapper_recover_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( + transformed_tensor_name, transformed_tensor_name, name, + transformed_tensor_name, transformed_tensor_name, + transformed_tensor_name, transformed_tensor_name, + tensor_wrapper_intermidiate_tensor_str) + inplace_grad_input_str = transformed_tensor_name if is_optional: tensor_wrapper_recover_str += "\n" + CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, @@ -1312,6 +1362,16 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, if IsPlainTensorType(ttype): get_tensor_str = f"{indent}auto& {transformed_tensor_name} = hooked_grads[{fwd_position}][0];" + # Inplace in 
backward op + if backward_inplace_map and name in backward_inplace_map.keys(): + grads_tensor_str = f"grads[{fwd_position}][0]" + get_tensor_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( + transformed_tensor_name, transformed_tensor_name, name, + transformed_tensor_name, transformed_tensor_name, + transformed_tensor_name, transformed_tensor_name, + grads_tensor_str) + inplace_grad_input_str = transformed_tensor_name + if is_optional: get_tensor_str += "\n" + CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, @@ -1357,8 +1417,16 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, grad_api_args.append(f"api_output_{out_index}") if IsPlainTensorType(ttype): + inplace_for_grad_outs_str = "" + if backward_inplace_map and name in backward_inplace_map.values( + ): + inplace_for_grad_outs_str = f""" +{indent}if (api_output_{out_index} != nullptr && can_be_inplaced) {{ +{indent} egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); +{indent}}}""" + grad_function_call_str += f""" - auto* api_output_{out_index} = (out_metas[{fwd_position}].empty() || out_metas[{fwd_position}][0].IsStopGradient()) ? nullptr : &returns[{fwd_position}][0];""" + auto* api_output_{out_index} = (out_metas[{fwd_position}].empty() || out_metas[{fwd_position}][0].IsStopGradient()) ? nullptr : &returns[{fwd_position}][0];{inplace_for_grad_outs_str}""" else: assert IsVectorTensorType(ttype) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 7672e49f368ce..c02400299dfa6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -259,7 +259,7 @@ def __init__(self, forward_api_contents, namespace): #self.optional_inputs #self.no_need_buffers #self.intermediate_outputs - #self.inplace_map + #self.forward_inplace_map FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) self.is_forward_only = True @@ -275,7 +275,7 @@ def CollectIsForwardOnly(self): def GeneratePythonCFunction(self): namespace = self.namespace - inplace_map = self.inplace_map + forward_inplace_map = self.forward_inplace_map forward_api_name = self.forward_api_name orig_forward_attrs_list = self.orig_forward_attrs_list forward_inputs_position_map = self.forward_inputs_position_map @@ -359,7 +359,7 @@ def GeneratePythonCFunction(self): forward_api_name_prefix, forward_api_name, namespace, forward_api_name, forward_api_name) - if inplace_map: + if forward_inplace_map: inplaced_forward_api_name = GetInplacedFunctionName( self.forward_api_name) if is_forward_only: @@ -372,16 +372,16 @@ def GeneratePythonCFunction(self): GetForwardFunctionName(inplaced_forward_api_name)) assert len( - inplace_map - ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}" - for inplace_input, inplace_output in inplace_map.items(): + forward_inplace_map + ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(forward_inplace_map)}" + for inplace_input, inplace_output in forward_inplace_map.items(): return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format( inplaced_forward_api_name, inplace_input, inplaced_forward_api_name, inplace_output) break # Generate Python-C Function Definetion - self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format( 
+ python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( inplaced_forward_api_name, pythonc_record_event_str, inplaced_forward_api_name, get_eager_tensor_str, parse_attributes_str, set_device_str, @@ -389,11 +389,20 @@ def GeneratePythonCFunction(self): inplaced_fwd_function_name, dygraph_function_call_str, return_str) - # Generate Python-C Function Registration - self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format( + python_c_inplace_func_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name_prefix, inplaced_forward_api_name, namespace, inplaced_forward_api_name, inplaced_forward_api_name) + # self.forward_api_name ending with '_' means it only has inplace api + if self.forward_api_name[-1] == '_': + self.python_c_function_str = python_c_inplace_func_str + # Generate Python-C Function Registration + self.python_c_function_reg_str = python_c_inplace_func_reg_str + else: + self.python_c_function_str += python_c_inplace_func_str + # Generate Python-C Function Registration + self.python_c_function_reg_str += "\n," + python_c_inplace_func_reg_str + def run(self): # Initialized is_forward_only self.CollectIsForwardOnly() @@ -401,8 +410,8 @@ def run(self): # Initialized optional_inputs self.ParseDispensable() - # Initialized inplace_map - self.ParseInplaceInfo() + # Initialized forward_inplace_map + self.ParseForwardInplaceInfo() # Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list self.CollectOriginalForwardInfo() diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index ee9da41881b2d..c3fe3551ccb28 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -60,7 +60,8 @@ inline std::vector EagerAmpAutoCasts( inline paddle::experimental::Tensor EagerAmpAutoCast( const std::string& input_name, const paddle::experimental::Tensor& input, - const paddle::experimental::DataType& dst_dtype, std::string op_name) { + const paddle::experimental::DataType& dst_dtype, + const std::string& op_name) { VLOG(6) << "AMP AmpAutoCasts:" << " input(" << input_name << ") dst_dtype(" << paddle::framework::DataType2String(dst_dtype) << ")."; @@ -87,4 +88,15 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( return input; } +inline paddle::optional EagerAmpAutoCast( + const std::string& input_name, + const paddle::optional& input, + const paddle::experimental::DataType& dst_dtype, + const std::string& op_name) { + if (input) { + return EagerAmpAutoCast(input_name, *input, dst_dtype, op_name); + } + return paddle::none; +} + } // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index d676955016684..d1c5983a3702f 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -110,4 +110,10 @@ void CheckTensorHasNanOrInf( } } +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTensorAndVector& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); +} + } // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index 5309eeb2959dc..a411504fa4900 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -31,6 +31,7 @@ using TupleOfFourTensors = std::tuple; using TupleOfFiveTensors = std::tuple; using TupleOfSixTensors = std::tuple; +using TupleOfTensorAndVector = std::tuple>; void 
CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); @@ -52,6 +53,9 @@ void CheckTensorHasNanOrInf(const std::string& api_name, void CheckTensorHasNanOrInf(const std::string& api_name, const std::vector& tensors); +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTensorAndVector& tensors); + void CheckTensorHasNanOrInf( const std::string& api_name, const paddle::small_vector, diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 8893e0ed7ee0a..495f7f2e42c59 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -116,6 +116,10 @@ class TensorWrapper { return recovered_tensor; } + paddle::experimental::Tensor get_intermidiate_tensor() { + return intermidiate_tensor_; + } + void clear() { intermidiate_tensor_.reset(); } private: diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index bcb9820419d0f..551262d259e08 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -250,7 +250,7 @@ TEST(EagerUtils, GetGradAccumulationNode) { ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0)); } -TEST(EagerUtils, FillZeroForEmptyGradInputs) { +TEST(EagerUtils, FillZeroForEmptyOptionalGradInput) { paddle::small_vector, egr::kSlotSmallVectorSize> grads = {std::vector(1)}; @@ -263,7 +263,7 @@ TEST(EagerUtils, FillZeroForEmptyGradInputs) { slot_metas[0][0].SetTensorMeta(tensor_meta); slot_metas[0][0].SetPlace(phi::CPUPlace()); - EagerUtils::FillZeroForEmptyGradInputs(&grads, slot_metas); + EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0], slot_metas[0]); eager_test::CompareTensorWithValue(grads[0][0], 0.0); } diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index fe1cdefb7d572..5a730e4dbf164 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -379,8 +379,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { "The hooked_grads.size() of RunProgramGradOp should " "be equal to 1.")); - egr::EagerUtils::FillZeroForEmptyGradInputs(&hooked_grads, - this->InputMeta()); + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&hooked_grads[0], + this->InputMeta()[0]); VLOG(3) << "hooked_grads[0].size() : " << hooked_grads[0].size(); std::vector x_grad; std::vector params_grad; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index f253c4cb51380..7d9554c52eb6c 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -73,7 +73,7 @@ AutogradMeta* EagerUtils::nullable_autograd_meta( } AutogradMeta* EagerUtils::nullable_autograd_meta( - paddle::optional target) { + const paddle::optional& target) { if (target.get_ptr() != nullptr) { return EagerUtils::nullable_autograd_meta(*(target.get_ptr())); } @@ -271,6 +271,33 @@ void EagerUtils::HandleViewBetweenInputAndOutput( } } +void EagerUtils::HandleViewBetweenInputAndOutput( + const paddle::experimental::Tensor& input_tensor, + paddle::experimental::Tensor* view_output_tensor) { + PADDLE_ENFORCE_EQ( + input_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", input_tensor.name())); + + if (input_tensor.is_dense_tensor()) { + auto input_dense_tensor = + std::dynamic_pointer_cast(input_tensor.impl()); + if (view_output_tensor->impl() == nullptr) { + 
view_output_tensor->set_impl(std::make_shared()); + } + auto view_output_dense_tensor = + std::dynamic_pointer_cast(view_output_tensor->impl()); + view_output_dense_tensor->ShareBufferWith(*input_dense_tensor); + view_output_dense_tensor->ShareInplaceVersionCounterWith( + *input_dense_tensor); + + VLOG(3) << "Perform View between Output Tensor(" + << view_output_tensor->name() << ") and Input Tensor(" + << input_tensor.name() + << "), share allocation and inplace version."; + } +} + std::vector EagerUtils::GetOutputs( const std::vector>& outs) { std::vector res; @@ -440,26 +467,16 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } } -void EagerUtils::FillZeroForEmptyGradInputs( - paddle::small_vector, - kSlotSmallVectorSize>* in_grads, - const paddle::small_vector, kSlotSmallVectorSize>& - grad_in_metas) { +void EagerUtils::FillZeroForEmptyOptionalGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas) { for (size_t i = 0; i < in_grads->size(); i++) { - for (size_t j = 0; j < (*in_grads)[i].size(); j++) { - paddle::experimental::Tensor& grad = (*in_grads)[i][j]; - if (!grad.initialized()) { - const GradSlotMeta& grad_in_meta = grad_in_metas[i][j]; - PADDLE_ENFORCE( - grad_in_meta.HasTensorMeta(), - paddle::platform::errors::Fatal( - "Unable to fill empty grad inputs due to empty GradSlotMeta")); - const auto& tensor_meta = grad_in_meta.GetTensorMeta(); - auto tensor_with_zero = paddle::experimental::full( - phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, - grad_in_meta.GetPlace()); - grad.set_impl(tensor_with_zero.impl()); - } + paddle::experimental::Tensor& grad = (*in_grads)[i]; + if (!grad.initialized() && grad_in_metas[i].HasTensorMeta()) { + auto tensor_with_zero = paddle::experimental::full( + phi::vectorize(grad_in_metas[i].GetTensorMeta().dims), 0.0, + grad_in_metas[i].GetTensorMeta().dtype, grad_in_metas[i].GetPlace()); + grad.set_impl(tensor_with_zero.impl()); } } } diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index b96244f0d138b..c6389e998315c 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -125,7 +125,7 @@ class EagerUtils { static AutogradMeta* nullable_autograd_meta( const paddle::experimental::Tensor& target); static AutogradMeta* nullable_autograd_meta( - paddle::optional target); + const paddle::optional& target); static std::vector nullable_autograd_meta( const std::vector& targets); static std::vector nullable_autograd_meta( @@ -172,6 +172,9 @@ class EagerUtils { static void HandleViewBetweenInputAndOutput( const std::shared_ptr& input_var, const std::shared_ptr& view_output_var); + static void HandleViewBetweenInputAndOutput( + const paddle::experimental::Tensor& input_tensor, + paddle::experimental::Tensor* view_output_tensor); // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper(TensorWrapper* tw); @@ -233,11 +236,9 @@ class EagerUtils { /** * Fill Zero * **/ - static void FillZeroForEmptyGradInputs( - paddle::small_vector, - kSlotSmallVectorSize>* out_grads, - const paddle::small_vector, - kSlotSmallVectorSize>& grad_out_metas); + static void FillZeroForEmptyOptionalGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas); static void FillZeroForEmptyGradInput(paddle::experimental::Tensor* in_grad, const GradSlotMeta& grad_in_meta); static void FillZeroForEmptyOptionalGradInput( diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index de563330d68e9..0c762ab2e77e5 100644 --- 
a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -320,12 +320,11 @@ static int compute_thread_batch_nccl( thread_avg_batch_num = static_cast(offset.size() / thr_num); #ifdef PADDLE_WITH_GLOO auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); - if (!gloo_wrapper->IsInitialized()) { - VLOG(0) << "GLOO is not inited"; - gloo_wrapper->Init(); - } - if (gloo_wrapper->Size() > 1) { + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } // adjust batch num per thread for NCCL std::vector thread_avg_batch_num_vec(1, thread_avg_batch_num); std::vector total_instance_num_vec(1, total_instance_num); diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 11217b6c485fc..823b60c5ef1f2 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -95,24 +95,6 @@ class HeterContext { } void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; } uint32_t ShardNum() { return shard_num_; } - void init(int shard_num, int device_num) { - shard_num_ = shard_num; - feature_keys_.resize(shard_num_); - value_ptr_.resize(shard_num_); - device_task_ptr_.resize(shard_num_); - device_task_keys_.resize(shard_num_); - for (size_t i = 0; i < device_task_ptr_.size(); i++) { - device_task_ptr_[i].resize(device_num); - device_task_keys_[i].resize(device_num); - } - - device_values_.resize(device_num); - device_keys_.resize(device_num); - mutex_.resize(device_num); - for (size_t i = 0; i < mutex_.size(); ++i) { - mutex_[i] = new std::mutex(); - } - } void init(int shard_num, int device_num, int dim_num) { shard_num_ = shard_num; diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 682c4568cb7e1..cb7f3a40d6720 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -69,6 +69,20 @@ struct FeaturePushValue { int mf_dim; float mf_g[0]; + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = a.lr_g + lr_g; + // out.mf_g = a.mf_g; + for (int i = 0; i < out.mf_dim; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; + } __device__ __forceinline__ void operator=(const FeaturePushValue& in) { show = in.show; clk = in.clk; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 32dbd98992b5d..57741c2c19b1c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -86,13 +86,43 @@ __global__ void dy_mf_search_kernel(Table* table, char* vals, size_t len, size_t pull_feature_value_size) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + // return; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { uint64_t offset = i * pull_feature_value_size; - FeatureValue& cur = *(FeatureValue*)(vals + offset); + FeatureValue* cur = (FeatureValue*)(vals + offset); FeatureValue& input = *(FeatureValue*)(it->second); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = 
input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; + } + } else { + if (keys[i] != 0) { + printf("warning::pull miss key: %d", keys[i]); + } + FeatureValue* cur = (FeatureValue*)(vals + i * pull_feature_value_size); + cur->delta_score = 0; + cur->show = 0; + cur->clk = 0; + cur->slot = -1; + cur->lr = 0; + cur->lr_g2sum = 0; + cur->mf_size = 0; + cur->mf_dim = 8; + cur->cpu_ptr; + for (int j = 0; j < cur->mf_dim + 1; j++) { + cur->mf[j] = 0; + } } } } @@ -125,7 +155,9 @@ __global__ void dy_mf_update_kernel(Table* table, FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { - printf("warning: push miss key: %d", keys[i]); + if (keys[i] != 0) { + printf("warning::push miss key: %d", keys[i]); + } } } } @@ -328,6 +360,8 @@ template class HashTable; template class HashTable; template class HashTable; template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; @@ -354,6 +388,8 @@ template void HashTable::get(const long* d_keys, cudaStream_t stream); template void HashTable::get( const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); // template void // HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t @@ -389,10 +425,9 @@ template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); -// template void HashTable::insert< -// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, -// size_t start_index, cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, const long* d_vals, size_t len, + cudaStream_t stream); template void HashTable:: dump_to_cpu(int devid, cudaStream_t stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 506a0c0b1863f..64b177abb8638 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -26,6 +26,7 @@ namespace framework { template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { + VLOG(1) << "Construct new HeterComm"; resource_ = resource; storage_.resize(resource_->total_device()); multi_mf_dim_ = resource->multi_mf(); @@ -364,6 +365,10 @@ HeterComm::~HeterComm() { delete table; table = nullptr; } + for (auto& table : tables_) { + delete table; + table = nullptr; + } } } @@ -473,17 +478,23 @@ void HeterComm::build_ps(int num, KeyType* h_keys, return; } int dev_id = resource_->dev_id(num); + DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); + + // use hbm pool std::vector d_key_bufs; + ppStream streams[stream_num]; // NOLINT for (int i = 0; i < stream_num; ++i) { create_stream(&(streams[i])); auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); d_key_bufs.push_back(std::move(d_k_buf)); } + int cur_len = 0; int cur_stream = 0; + while (cur_len < len) { cur_stream = cur_stream % stream_num; auto cur_use_stream = streams[cur_stream]; @@ -491,8 +502,10 @@ void HeterComm::build_ps(int num, KeyType* h_keys, cur_use_stream = 0; #endif int tmp_len = cur_len + chunk_size > len ? 
len - cur_len : chunk_size; + auto dst_place = place; auto src_place = platform::CPUPlace(); + memory_copy( dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); @@ -557,14 +570,20 @@ void HeterComm::dynamic_merge_grad( platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); auto stream = resource_->local_stream(gpu_num, 0); + size_t temp_storage_bytes; + + // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; size_t grad_value_size = TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); + auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); uint32_t* d_fea_num_info_ptr = reinterpret_cast(d_fea_num_info->ptr()); @@ -836,9 +855,16 @@ void HeterComm::push_sparse(int dev_num, auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + GradType* d_shard_grads_ptr; - auto d_shard_grads = memory::Alloc(place, len * grad_value_size); - d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + if (!multi_mf_dim_) { + auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } else { + auto d_shard_grads = memory::Alloc(place, len * grad_value_size); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } + int uniq_len = len; dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); @@ -846,9 +872,16 @@ void HeterComm::push_sparse(int dev_num, split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->dy_mf_fill_shard_grads( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, uniq_len, - grad_value_size, stream); + + if (!multi_mf_dim_) { + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, stream); + } else { + heter_comm_kernel_->dy_mf_fill_shard_grads( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, grad_value_size, stream); + } sync_stream(stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index f44803982a55a..94d7929b2947d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -136,6 +136,7 @@ __global__ void merge_gradients_kernel(const uint32_t* offset, size_t grad_value_size, DynamicGradMerger& merger_) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps index f73757902fef6..b44ea1807fd65 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -18,6 +18,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_XPU_KP) #include #include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/debug.h" // NOLINT #include "xpu/kernel/math.h" #include "xpu/kernel/simd.h" #endif @@ -91,7 +92,7 @@ __global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); GM2LM(idx + i, local_idx, read_len * sizeof(T)); - for (int k = 0; k < read_len; k++) { + for (int k = 0; k < read_len - 1; k++) { if (local_idx[k] != local_idx[k + 1]) { int real_idx = i + k; local_right[local_idx[k]] = real_idx; @@ -102,7 +103,7 @@ __global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, local_left[local_idx[i]] = i; } if (i + read_len == len) { - local_right[local_idx[len - 1]] = len - 1; + local_right[local_idx[read_len - 1]] = len - 1; } } // to be optimized: call LM2GM too frequently @@ -150,7 +151,7 @@ __global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); const int buf_size = 400; - __local__ KeyType local_keys[buf_size]; + // __local__ KeyType local_keys[buf_size]; __local__ KeyType local_shard_keys[buf_size]; __local__ T local_idx[buf_size]; int len_per_loop = min(buf_size, roundup_div(len, nthreads)); @@ -158,10 +159,11 @@ __global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, i += nthreads * len_per_loop) { // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); - GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + // GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); GM2LM(idx + i, local_idx, read_len * sizeof(T)); for (int k = 0; k < read_len; k++) { - local_shard_keys[k] = local_keys[local_idx[k]]; + GM2LM(d_keys + local_idx[k], &local_shard_keys[k], 1 * sizeof(KeyType)); + // local_shard_keys[k] = local_keys[local_idx[k]]; } LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); } @@ -181,9 +183,9 @@ __global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); - const int buf_size = 100; - __local__ KeyType local_keys[buf_size]; - __local__ GradType local_grads[buf_size]; + const int buf_size = 50; + // __local__ KeyType local_keys[buf_size]; + // __local__ GradType local_grads[buf_size]; __local__ KeyType local_shard_keys[buf_size]; __local__ GradType local_shard_grads[buf_size]; __local__ T local_idx[buf_size]; @@ -193,12 +195,15 @@ __global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, i += nthreads * len_per_loop) { // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); - GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); - GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); + // GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + // GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); GM2LM(idx + i, local_idx, read_len * sizeof(T)); for (int k = 0; k < read_len; k++) { - local_shard_keys[k] = local_keys[local_idx[k]]; - local_shard_grads[k] = local_grads[local_idx[k]]; + GM2LM(d_keys + local_idx[k], &local_shard_keys[k], 1 * sizeof(KeyType)); + GM2LM(d_grads + local_idx[k], &local_shard_grads[k], + 1 * sizeof(GradType)); + // local_shard_keys[k] = local_keys[local_idx[k]]; + // local_shard_grads[k] = local_grads[local_idx[k]]; } LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); 
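Note: in the fill_shard_* KP kernels above, the bulk GM2LM staging of keys/grads is replaced with per-element GM2LM gathers driven by the staged index tile, which trims the local-memory footprint (the local_keys/local_grads arrays are dropped and fill_shard_grads' buf_size shrinks from 100 to 50); the calc_shard_offset loop bound change (read_len -> read_len - 1) also keeps the k + 1 lookahead from reading past the staged tile. A minimal CPU-side sketch of that gather-by-index access pattern, with illustrative names rather than the kernels' real identifiers:

    #include <cstring>

    // Stage only the index tile locally; fetch each key individually from
    // "global" memory instead of copying a whole contiguous key tile first.
    void FillShardKeys(const unsigned long long* d_keys, const int* idx,
                       unsigned long long* d_shard_keys, int len) {
      constexpr int kTile = 400;  // mirrors the kernel's staging buf_size
      int local_idx[kTile];
      unsigned long long local_shard_keys[kTile];
      for (int i = 0; i < len; i += kTile) {
        const int read_len = (len - i < kTile) ? (len - i) : kTile;
        std::memcpy(local_idx, idx + i, read_len * sizeof(int));  // GM2LM(idx)
        for (int k = 0; k < read_len; ++k) {
          local_shard_keys[k] = d_keys[local_idx[k]];  // per-key GM2LM
        }
        std::memcpy(d_shard_keys + i, local_shard_keys,
                    read_len * sizeof(unsigned long long));  // LM2GM
      }
    }
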
LM2GM(local_shard_grads, d_shard_grads + i, read_len * sizeof(GradType)); @@ -227,9 +232,10 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, GM2LM(idx + i, local_idx, read_len * sizeof(T)); GM2LM(d_shard_vals + i, local_shard_vals, read_len * sizeof(ValType)); for (int k = 0; k < read_len; k++) { - local_vals[local_idx[k]] = local_shard_vals[k]; + LM2GM(&local_shard_vals[k], d_vals + local_idx[k], 1 * sizeof(ValType)); + // local_vals[local_idx[k]] = local_shard_vals[k]; } - LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); + // LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); } } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index a22704bd1ed03..65892f8488475 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -28,11 +28,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" + #include #include -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" +#if defined(PADDLE_WITH_PSCORE) +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { @@ -106,25 +111,17 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - if (!multi_mf_dim_) { - gpu_task->init(thread_keys_shard_num_, device_num); - } else { - gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); - } + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); std::vector threads; - if (!multi_mf_dim_) { - thread_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_keys_[i].resize(thread_keys_shard_num_); - } - } else { - thread_dim_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_dim_keys_[i].resize(thread_keys_shard_num_); - for (int j = 0; j < thread_keys_shard_num_; j++) { - thread_dim_keys_[i][j].resize(multi_mf_dim_); - } + + // data should be in input channel + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); } } @@ -144,18 +141,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { len_per_thread = total_len / thread_keys_thread_num_; remain = total_len % thread_keys_thread_num_; VLOG(0) << "total len: " << total_len; - auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(feasign); - } - } - }; auto gen_dynamic_mf_func = [this](const std::deque& total_data, int begin_index, int end_index, int i) { for (auto iter = total_data.begin() + begin_index; @@ -177,17 +162,10 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } }; for (int i = 0; i < thread_keys_thread_num_; i++) { - if (!multi_mf_dim_) { - VLOG(0) << 
"yxf::psgpu wrapper genfunc"; - threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - } else { - VLOG(0) << "yxf::psgpu wrapper genfunc with dynamic mf"; - threads.push_back( - std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - } + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); } for (std::thread& t : threads) { @@ -235,12 +213,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { threads.clear(); // merge thread_keys to shard_keys - auto merge_ins_func = [this, gpu_task](int shard_num) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); - thread_keys_[i][shard_num].clear(); - } - }; auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { for (int i = 0; i < thread_keys_thread_num_; ++i) { gpu_task->batch_add_keys(shard_num, dim_id, @@ -249,12 +221,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } }; for (int i = 0; i < thread_keys_shard_num_; ++i) { - if (!multi_mf_dim_) { - threads.push_back(std::thread(merge_ins_func, i)); - } else { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); - } + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); } } for (auto& t : threads) { @@ -271,9 +239,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { - if (i == 0 && j == multi_mf_dim_ - 1) { - gpu_task->feature_dim_keys_[i][j].push_back(0); - } VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); gpu_task->value_dim_ptr_[i][j].resize( @@ -297,12 +262,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto& device_dim_keys = gpu_task->device_dim_keys_; auto& device_dim_ptr = gpu_task->device_dim_ptr_; auto& device_dim_mutex = gpu_task->dim_mutex_; - if (multi_mf_dim_) { - for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { - device_dim_keys[dev].resize(multi_mf_dim_); - device_dim_ptr[dev].resize(multi_mf_dim_); - } + + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); } + // auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); @@ -329,10 +294,10 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { -#ifdef PADDLE_WITH_PSLIB size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; int32_t cnt = 0; +#ifdef PADDLE_WITH_PSLIB while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( i, reinterpret_cast(local_dim_ptr[i][j].data()), @@ -362,6 +327,38 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { break; } } +#endif +#ifdef PADDLE_WITH_PSCORE + while (true) { + auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, + local_dim_keys[i][j].data(), key_size); + bool flag = true; + + tt.wait(); + + try { 
+ status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } +#endif if (status != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; sleep(300); @@ -370,7 +367,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " << local_dim_keys[i][j].size(); } -#endif }; threads.resize(thread_keys_shard_num_ * multi_mf_dim_); @@ -406,15 +402,22 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { &local_dim_ptr, &device_dim_keys, &device_dim_ptr, &device_dim_mutex](int i, int j) { -#ifdef PADDLE_WITH_PSLIB std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs( + device_num); +#endif for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { int shard = local_dim_keys[i][j][k] % device_num; task_keys[shard].push_back(local_dim_keys[i][j][k]); task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); } + // allocate local keys to devices for (int dev = 0; dev < device_num; dev++) { device_dim_mutex[dev][j]->lock(); int len = task_keys[dev].size(); @@ -427,7 +430,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } device_dim_mutex[dev][j]->unlock(); } -#endif }; auto build_func = [device_num, record_status, &pass_values, &local_keys, &local_ptr, &device_task_keys, &device_task_ptrs](int i) { @@ -619,6 +621,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } + if (HeterPs_) { delete HeterPs_; HeterPs_ = nullptr; @@ -664,10 +667,26 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { val->lr_g2sum = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: DownpourCtrDymfFeatureValue::embed_g2sum_index()]; - val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: mf_dim_index()] = float(mf_dim); val->mf_dim = mf_dim; +#endif +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor accessor; + val->delta_score = + ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; + val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; + val->clk = ptr_val[accessor.common_feature_value.ClickIndex()]; + val->slot = int(ptr_val[accessor.common_feature_value.SlotIndex()]); + val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; + val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; + + val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); + val->mf_dim = mf_dim; #endif if (dim > 8) { // CpuPS alreay expand as mf_dim val->mf_size = mf_dim + 1; @@ -681,11 +700,15 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } } } + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + 
this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, feature_value_size, 500000, 2); + if (device_dim_keys.size() > 0) { VLOG(0) << "show ptr table: " << i << " table kv size: " << device_dim_keys.size() @@ -700,6 +723,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); } } + for (std::thread& t : threads) { t.join(); } @@ -723,7 +747,9 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { InitSlotInfo(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); gpu_task->Reset(); + data_ready_channel_->Put(gpu_task); + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -805,6 +831,7 @@ void PSGPUWrapper::EndPass() { timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu + for (size_t i = 0; i < heter_devices_.size(); i++) { for (int j = 0; j < multi_mf_dim_; j++) { keysize_max = @@ -821,11 +848,12 @@ void PSGPUWrapper::EndPass() { VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; size_t feature_value_size = TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + char* test_build_values = (char*)malloc(feature_value_size * len); cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, cudaMemcpyDeviceToHost); + CHECK(len == hbm_pool->capacity()); -#ifdef PADDLE_WITH_PSLIB uint64_t unuse_key = std::numeric_limits::max(); for (size_t i = 0; i < len; ++i) { if (device_keys[i] == unuse_key) { @@ -833,6 +861,7 @@ void PSGPUWrapper::EndPass() { } size_t offset = i * feature_value_size; FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset); +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -852,13 +881,32 @@ void PSGPUWrapper::EndPass() { embed_g2sum_index()] = gpu_val->lr_g2sum; cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: slot_index()] = gpu_val->slot; +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + + paddle::distributed::CtrDymfAccessor accessor; + cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = + gpu_val->delta_score; + cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; + cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; + cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; + cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = + gpu_val->lr_g2sum; + cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; +#endif if (gpu_val->mf_size > 0) { for (int x = 0; x < gpu_val->mf_dim + 1; x++) { cpu_val[x + 8] = gpu_val->mf[x]; } } } -#endif free(test_build_values); }; if (multi_mf_dim_) { @@ -972,7 +1020,6 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, feature_value_size = TYPEALIGN( 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); - VLOG(0) << "yxf pull sparse feature_value_size: " << feature_value_size; #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begine Gpu Ps PullSparse"; @@ -1159,6 +1206,8 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); } 
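Note: in the EndPass changes above, the dump-to-CPU loop is no longer wrapped entirely in PADDLE_WITH_PSLIB; the device pool is copied to a host buffer once and walked with a byte stride of feature_value_size (records carry a variable-length mf tail, so sizeof(FeatureValue) alone is not the stride), and only the accessor-specific writes stay behind the PSLIB/PSCORE guards. A stripped-down sketch of that copy-and-walk pattern, with a hypothetical Record type standing in for FeatureValue:

    #include <cuda_runtime.h>
    #include <cstdlib>

    struct Record { float show, clk, lr; };  // placeholder; real values append mf[mf_dim + 1]

    // Copy a device-resident pool to host and visit each record by byte stride.
    void DumpPoolToHost(const char* d_pool, size_t len, size_t value_size) {
      char* h_buf = static_cast<char*>(std::malloc(value_size * len));
      cudaMemcpy(h_buf, d_pool, value_size * len, cudaMemcpyDeviceToHost);
      for (size_t i = 0; i < len; ++i) {
        const Record* rec =
            reinterpret_cast<const Record*>(h_buf + i * value_size);
        (void)rec;  // copy rec fields back into the CPU-side accessor slots here
      }
      std::free(h_buf);
    }
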
all_timer.Pause(); + time_3 += all_timer.ElapsedSec(); + time_4 += push_gpups_timer.ElapsedSec(); VLOG(3) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec() << " s"; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 9b55626645942..0efec57e59db6 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -333,6 +333,11 @@ class PSGPUWrapper { void SetSlotOffsetVector(const std::vector& slot_offset_vector) { slot_offset_vector_ = slot_offset_vector; + std::cout << "yxf set: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; } #ifdef PADDLE_WITH_CUDA @@ -431,6 +436,12 @@ class PSGPUWrapper { int max_mf_dim_{0}; size_t val_type_size_{0}; size_t grad_type_size_{0}; + + double time_1 = 0.0; + double time_2 = 0.0; + double time_3 = 0.0; + double time_4 = 0.0; + int multi_node_{0}; int node_size_; uint64_t table_id_; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index 58b9f0f722f8c..ef6c70e624d4c 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -28,9 +28,9 @@ limitations under the License. */ namespace paddle { namespace framework { -__global__ void PullCopy(float** dest, const FeatureValue* src, +__global__ void PullCopy(float* dest, const FeatureValue* src, const long long* len, int hidden, int slot_num, - int total_len, unsigned long long** keys) { + int total_len, unsigned long long* keys) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -41,11 +41,21 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, __local__ int64_t local_len[slot_num]; GM2LM(len, local_len, slot_num * sizeof(int64_t)); + __global_ptr__ unsigned long long* local_keys[slot_num]; + GM2LM(keys, local_keys, + slot_num * sizeof(__global_ptr__ unsigned long long*)); + + __global_ptr__ float* local_dest[slot_num]; + GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*)); + + int read_len = 30; + for (int i = thread_id; i < slot_num; i += nthreads) { // max core local memory = 8KB // slot's max memory size = slot_len * sizeof(FeatureValue) int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; - int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), slot_len); + // int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), + // slot_len); int dest_len = i ? 
local_len[i - 1] : 0; __local__ FeatureValue local_slot_vals[read_len]; __local__ float local_dest_vals[read_len * hidden]; @@ -56,7 +66,8 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, int real_read_len = min(read_len, slot_len - k); GM2LM(src + dest_len + k, local_slot_vals, real_read_len * sizeof(FeatureValue)); - GM2LM(keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + GM2LM(local_keys[i] + k, local_slot_keys, + real_read_len * sizeof(uint64_t)); for (int j = 0; j < real_read_len; j++) { if (local_slot_keys[j] == 0) { local_dest_vals[j * hidden] = 0; @@ -78,7 +89,7 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } } - LM2GM(local_dest_vals, dest[i] + k * hidden, + LM2GM(local_dest_vals, local_dest[i] + k * hidden, real_read_len * hidden * sizeof(float)); } } @@ -120,7 +131,7 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, } } -__global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, +__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, int hidden, int slot_num, int total_len, int bs, int* slot_vector) { int cid = core_id(); @@ -135,12 +146,16 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, GM2LM(len, local_len, slot_num * sizeof(int64_t)); GM2LM(slot_vector, local_slot, slot_num * sizeof(int)); + __global_ptr__ float* local_src[slot_num]; + GM2LM(src, local_src, slot_num * sizeof(__global_ptr__ float*)); + for (int i = thread_id; i < slot_num; i += nthreads) { int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; // max core local memory = 8KB // slot's max memory size = slot_len * hidden * 8 - int read_len = min(roundup_div(1024, hidden), slot_len); + // int read_len = min(roundup_div(1024, hidden), slot_len); + int read_len = 40; int dest_len = i ? 
local_len[i - 1] : 0; __local__ float local_slot_grads[read_len * hidden]; __local__ FeaturePushValue local_dest_grads[read_len]; @@ -148,7 +163,7 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, // copy read_len(length) of slots' grad to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src[i] + k * hidden, local_slot_grads, + GM2LM(local_src[i] + k * hidden, local_slot_grads, real_read_len * hidden * sizeof(float)); // copy from slots' grad to total grad for (int j = 0; j < real_read_len; j++) { @@ -181,14 +196,18 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, stream = static_cast(dev_ctx) ->x_context() ->xpu_stream; - float* buf_value = nullptr; - xpu_malloc(reinterpret_cast(&buf_value), + // float* buf_value = nullptr; + // xpu_malloc(reinterpret_cast(&buf_value), + // values.size() * sizeof(float*)); + // float** gpu_values = reinterpret_cast(&buf_value); + float* gpu_values = nullptr; + xpu_malloc(reinterpret_cast(&gpu_values), values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(&buf_value); xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - unsigned long long** c_keys = (unsigned long long**)gpu_keys; + // unsigned long long** c_keys = (unsigned long long**)gpu_keys; + unsigned long long* c_keys = reinterpret_cast(gpu_keys); const long long* c_len = (const long long*)gpu_len; PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, slot_num, total_length, c_keys); @@ -230,20 +249,17 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, slot_lengths_lod[i] += slot_lengths_lod[i - 1]; } - float* buf_grad_value = nullptr; - int64_t* buf_length = nullptr; - int* buf_slot_vector = nullptr; + float* gpu_values = nullptr; + int64_t* gpu_len = nullptr; + int* d_slot_vector = nullptr; - xpu_malloc(reinterpret_cast(&buf_grad_value), + xpu_malloc(reinterpret_cast(&gpu_values), grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&buf_length), + xpu_malloc(reinterpret_cast(&gpu_len), slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&buf_slot_vector), + xpu_malloc(reinterpret_cast(&d_slot_vector), slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(&buf_grad_value); - int64_t* gpu_len = reinterpret_cast(buf_length); - int* d_slot_vector = reinterpret_cast(buf_slot_vector); xpu_memcpy(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); xpu_memcpy(gpu_len, slot_lengths_lod.data(), diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 2bd8ed900f102..b621eca35b893 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static int64_t num_cuda_devices = -1; @@ -58,8 +58,6 @@ const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { const std::shared_ptr& DefaultCPUGenerator() { static auto default_cpu_generator = std::make_shared(GetRandomSeed()); - VLOG(4) << "initial seed: " << default_cpu_generator->GetCurrentSeed() - << ", cpu engine: " << default_cpu_generator->GetCPUEngine().get(); return default_cpu_generator; } @@ -100,19 +98,13 @@ const std::shared_ptr& GetRandomSeedGenerator( return iter->second; } -std::shared_ptr OpDefaultCPUEngine() { - static auto op_default_cpu_engine = std::make_shared(); - return op_default_cpu_engine; -} - -// NOTE(zhiqiu): there are 3 conditions: -// (1) op seed is not set and DefaultCPUGenerator is inited, use -// DefaultCPUGenerator -// (2) op seed is not set and DefaultCPUGenerator is not inited, use se -// OpDefaultCPUEngine() and set a radnom seed -// (3) op seed is set, use OpDefaultCPUEngine() and set the seed +// There are 3 conditions: +// (1) op seed is set, use op seed. +// (2) op seed is not set, global seed is set, use global seed. +// (3) op seed is not set, global seed is not set too, use random seed from +// RandomGenerator. std::shared_ptr GetCPURandomEngine(uint64_t seed) { - if (DefaultCPUGenerator()->GetIsInitPy() && seed == 0) { + if (seed == 0) { VLOG(4) << "Use random engine from generator"; return DefaultCPUGenerator()->GetCPUEngine(); } else { @@ -123,12 +115,6 @@ std::shared_ptr GetCPURandomEngine(uint64_t seed) { // // And we need to measure the determinacy of Generator in PE. 
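Note: the rewritten comment above replaces the old GetIsInitPy()-based rule: a zero seed now always means "use the shared default engine" (seeded from the global seed or a random one), while a non-zero op seed gets its own deterministically seeded engine. A standalone simplification of that selection logic using only the standard library (names are illustrative; the real code returns the engine held by DefaultCPUGenerator on the zero-seed path and guards seeding with a mutex):

    #include <memory>
    #include <random>

    // seed == 0: fall back to the shared, globally/randomly seeded engine.
    // seed != 0: build a dedicated engine so the op is reproducible.
    std::shared_ptr<std::mt19937_64> GetCPUEngineForSeed(uint64_t seed) {
      static auto default_engine =
          std::make_shared<std::mt19937_64>(std::random_device{}());
      if (seed == 0) {
        return default_engine;
      }
      auto engine = std::make_shared<std::mt19937_64>();
      engine->seed(seed);
      return engine;
    }
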
auto engine = std::make_shared(); - if (seed == 0) { - seed = GetRandomSeed(); - VLOG(4) << "Use default random engine with random seed = " << seed; - } else { - VLOG(4) << "Use default random engine with fixed random seed = " << seed; - } static std::mutex mu_; { std::lock_guard lock(mu_); @@ -204,11 +190,5 @@ std::pair Generator::IncrementOffset( #endif } -void Generator::SetIsInitPy(bool is_init_py) { - this->is_init_py_ = is_init_py; - VLOG(4) << "SetIsInitPy:" << this->is_init_py_; -} -bool Generator::GetIsInitPy() const { return this->is_init_py_; } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 1c19234bf7d80..35efc1bee33d5 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -59,7 +59,6 @@ struct Generator : public phi::Generator { this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; - this->is_init_py_ = true; // TODO(zhiqiu): remove it in future } Generator(uint64_t seed, uint64_t device_id) { std::seed_seq seq({seed}); @@ -71,7 +70,6 @@ struct Generator : public phi::Generator { this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; - this->is_init_py_ = false; // TODO(zhiqiu): remove it in future } Generator(const Generator& other) = delete; @@ -95,32 +93,21 @@ struct Generator : public phi::Generator { std::pair IncrementOffset(uint64_t increament_offset); - void SetIsInitPy(bool); - bool GetIsInitPy() const; uint64_t get_device_id() { return this->state_.device; } private: phi::Generator::GeneratorState state_; std::shared_ptr engine_; mutable std::mutex mu_; - - // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with - // old seed, and it should be removed after all random-related operators - // and unittests upgrades to use generator. - bool is_init_py_ = false; }; // The DefaultCPUGenerator is used in manual_seed() const std::shared_ptr& DefaultCPUGenerator(); -// If op seed is set or global is not set, the OpDefaultCPUEngine is used. 
-std::shared_ptr OpDefaultCPUEngine(); +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id = -1); std::shared_ptr GetCPURandomEngine(uint64_t); -const std::shared_ptr& GetDefaultCUDAGenerator( - int64_t device_id = -1); - const std::shared_ptr& SetRandomSeedGenerator( const std::string& name, uint64_t seed); diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cb33e87f490c2..a7138fd2642a8 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -219,6 +219,10 @@ void HogwildWorker::TrainFiles() { device_reader_->Start(); int cur_batch; int batch_cnt = 0; + +#if defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_CUDA) + platform::SetDeviceId(thread_id_); +#endif while ((cur_batch = device_reader_->Next()) > 0) { for (auto &op : ops_) { bool need_skip = false; @@ -244,9 +248,12 @@ void HogwildWorker::TrainFiles() { ++batch_cnt; PrintFetchVars(); thread_scope_->DropKids(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } timeline.Pause(); - VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() << " seconds, ins_num: " << total_ins_num; if (need_dump_field_ || need_dump_param_) { diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 2a8ffbf431ecd..d7901a83b8502 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -349,14 +349,6 @@ const phi::MetaTensor& CompatInferMetaContext::InputAt(size_t idx) const { return compat_inputs_.at(idx); } -paddle::optional -CompatInferMetaContext::OptionalInputAt(size_t idx) const { - const auto& input = compat_inputs_.at(idx); - return input.initialized() - ? paddle::optional{input} - : paddle::optional{paddle::none}; -} - std::vector CompatInferMetaContext::InputsBetween( size_t start, size_t end) const { std::vector result; @@ -370,7 +362,7 @@ std::vector CompatInferMetaContext::InputsBetween( return result; } -paddle::optional> +paddle::optional> CompatInferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { const auto& first = compat_inputs_.at(start); @@ -383,10 +375,10 @@ CompatInferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { result.emplace_back(in.initialized() ? &in : nullptr); } - return paddle::optional>(result); + return paddle::optional>( + std::move(result)); } - return paddle::optional>( - paddle::none); + return paddle::none; } phi::MetaTensor* CompatInferMetaContext::MutableOutputAt(size_t idx) { diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 855e873b30951..04ac1ff59f7ee 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -59,6 +59,12 @@ class CompatMetaTensor : public phi::MetaTensor { bool initialized() const override { return initialized_; }; + operator unspecified_bool_type() const override { + return initialized_ ? 
unspecified_bool_true : 0; + } + + bool operator!() const override { return !initialized_; } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); @@ -107,13 +113,11 @@ class CompatInferMetaContext : public phi::InferMetaContext { outputs); const phi::MetaTensor& InputAt(size_t idx) const override; - paddle::optional OptionalInputAt( - size_t idx) const override; std::vector InputsBetween(size_t start, size_t end) const override; - paddle::optional> - OptionalInputsBetween(size_t start, size_t end) const override; + paddle::optional> OptionalInputsBetween( + size_t start, size_t end) const override; phi::MetaTensor* MutableOutputAt(size_t idx) override; std::vector MutableOutputBetween(size_t start, diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc index f01894f2cf448..361153de7d73a 100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -69,7 +69,7 @@ static int close_open_fds_internal() { for (;;) { int bytes = 0; - if ((bytes = syscall(SYS_getdents, dir_fd, + if ((bytes = syscall(SYS_getdents64, dir_fd, reinterpret_cast(buffer), sizeof(buffer))) < 0) { PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc index e86bb2926b640..79a06572d1427 100644 --- a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc @@ -30,13 +30,19 @@ void FillConstData(LoDTensor* out_t, T value) { void DeleteFillConstantOpPass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init("delete_fill_constant_op_pass", graph); GraphPatternDetector detector; - auto fill_constant_op = detector.mutable_pattern() - ->NewNode("fill_constant") - ->assert_is_op("fill_constant") - ->assert_is_not_op_input("ValueTensor") - ->assert_is_not_op_input("str_value") - ->assert_is_not_op_input("ShapeTensor") - ->assert_is_not_op_input("ShapeTensorList"); + auto fill_constant_op = + detector.mutable_pattern() + ->NewNode("fill_constant") + ->assert_is_op("fill_constant") + ->assert_is_not_op_input("ValueTensor") + ->assert_is_not_op_input("str_value") + ->assert_is_not_op_input("ShapeTensor") + ->assert_is_not_op_input("ShapeTensorList") + ->assert_more([&](Node* node) { + return node->Op() + ->GetAttrIfExists>("shape") + .size() == 1; + }); auto fill_constant_out = detector.mutable_pattern() ->NewNode("fill_constant_out") diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index cab8f82660d90..3c6b6ce94e23f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1057,7 +1057,7 @@ struct Pool : public PatternBase { }; // Elementwise ops -// Forward pass for element-wise operators (add, mul) +// Forward pass for element-wise operators // elementwise_out is the result of the operator struct Elementwise : public PatternBase { Elementwise(PDPattern* pattern, const std::string& name_scope) diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 4aae60b853d4f..a61c043b58065 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1188,6 +1188,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeMatmul(graph); QuantizeElementwise(graph, "elementwise_add"); 
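Note: CPUQuantizePass now also quantizes elementwise_sub, and the cpu_quantize_pass_tester.cc changes that follow fold the previously duplicated per-op tests into one value-parameterized suite driven by {op_type, pattern_name} pairs that covers add, mul, and sub. A minimal, self-contained sketch of that gtest pattern (fixture and test names here are illustrative, not the ones used in the tester):

    #include <gtest/gtest.h>
    #include <string>
    #include <vector>

    class ElementwiseQuantizeTest
        : public testing::TestWithParam<std::vector<std::string>> {};

    TEST_P(ElementwiseQuantizeTest, basic) {
      const auto& param = GetParam();
      ASSERT_EQ(param.size(), 2u);  // {op_type, pattern_name}
      // ... build the program for param[0]/param[1] and run the pass ...
    }

    INSTANTIATE_TEST_CASE_P(
        Elementwises, ElementwiseQuantizeTest,
        testing::ValuesIn(std::vector<std::vector<std::string>>{
            {"elementwise_add", "ElementwiseAdd"},
            {"elementwise_mul", "ElementwiseMul"},
            {"elementwise_sub", "ElementwiseSub"}}),
        [](const testing::TestParamInfo<std::vector<std::string>>& info) {
          return info.param[0];  // per-op test name suffix
        });
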
QuantizeElementwise(graph, "elementwise_mul"); + QuantizeElementwise(graph, "elementwise_sub"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 22000865948d6..912c16288c2b9 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add" || type == "elementwise_mul") { + } else if (type == "elementwise_add" || type == "elementwise_mul" || + type == "elementwise_sub") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -168,7 +169,7 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); } else if (type == "matmul" || type == "elementwise_add" || - type == "elementwise_mul") { + type == "elementwise_mul" || type == "elementwise_sub") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -565,60 +566,59 @@ ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, return prog; } -void TestElementwise(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwise(std::vector elementwise) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, SCALE * S8_MAX); } -void TestElementwiseOutputScaleMissing(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwiseOutputScaleMissing(std::vector elementwise) { int added_nodes = 0; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, 1.f, 1.f, "e"); } -void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwiseUnsignedAndSignedInput( + std::vector elementwise) { int added_nodes = 0; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, 1.f, 1.f, "", "b"); } -TEST(CpuQuantizePass, elementwise_add) { - TestElementwise("elementwise_add", "ElementwiseAdd"); -} +const std::vector> elementwises = { + {"elementwise_add", "ElementwiseAdd"}, + {"elementwise_mul", "ElementwiseMul"}, + {"elementwise_sub", 
"ElementwiseSub"}}; -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { - TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); -} +class TestElementwises + : public testing::TestWithParam> {}; -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { - TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); -} +TEST_P(TestElementwises, elementwise_basic) { TestElementwise(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul) { - TestElementwise("elementwise_mul", "ElementwiseMul"); +TEST_P(TestElementwises, elementwise_output_scale_missing) { + TestElementwiseOutputScaleMissing(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { - TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +TEST_P(TestElementwises, elementwise_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { - TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); -} +INSTANTIATE_TEST_CASE_P( + Elementwises, TestElementwises, testing::ValuesIn(elementwises), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param[0]; + return name; + }); const std::vector churn_out_vars(ProgramDesc* prog, const std::string& prefix, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 3b883dac9782a..5b606a89ac90a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -27,9 +27,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { std::unordered_set supported_op_types = std::unordered_set( {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", - "elementwise_mul", "fc", "matmul", "nearest_interp", - "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", - "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); + "elementwise_mul", "elementwise_sub", "fc", "matmul", + "nearest_interp", "nearest_interp_v2", "pool2d", "prior_box", + "reshape2", "transpose2", "fusion_gru", "fusion_lstm", "multi_gru", + "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h index 9233650a2db3c..383c4f40fc03d 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h @@ -14,10 +14,6 @@ #pragma once -// #include -// #include -// #include - #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 55470db312f81..63e402cb52983 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -489,14 +489,6 @@ void QuantDequantMkldnnPass::UpdateActivations(ir::Graph* graph) const { std::string activation; if (op_desc->GetAttrIfExists("fuse_relu")) { activation = "relu"; - } else if (op_desc->GetAttrIfExists("fuse_brelu")) { - activation = "relu6"; - float alpha = 6.0; - if (op_desc->HasAttr("fuse_brelu_threshold")) { - alpha = BOOST_GET_CONST(float, - 
op_desc->GetAttr("fuse_brelu_threshold")); - } - op_node->Op()->SetAttr("fuse_alpha", alpha); } op_node->Op()->SetAttr("fuse_activation", activation); } diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index 0fc458723ffe4..60d661f7740d0 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -91,6 +91,10 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, scale_matmul_pattern); GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, scale_matmul_pattern); + if ((scale_out->outputs).size() != 1) { + return; + } + if (scale_op->Op()->GetAttrIfExists("bias") == 0.0) { auto matmul_alpha = matmul_op->Op()->GetAttrIfExists("alpha"); auto scale_scale = scale_op->Op()->GetAttrIfExists("scale"); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index a8595d55b31b0..4a5947778056a 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -864,7 +864,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, auto* mul0_op_desc = mul0->Op(); // all mul op has same input. - if (multihead_op_desc.HasAttr("Input_scale")) { + if (mul0_op_desc->HasAttr("Input_scale")) { multihead_op_desc.SetAttr("Input_scale", mul0_op_desc->GetAttr("Input_scale")); } diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 281e0b9910619..e436bee035cea 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -488,7 +488,7 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, // Convert weight to fp32 range auto* weight_tensor = scope->Var(quantized_op_weight_node->Name())->GetMutable(); - auto w_dims = weight_tensor->dims(); + const auto& w_dims = weight_tensor->dims(); float* quantized_weight_data = weight_tensor->mutable_data(platform::CPUPlace()); // If quantized op is fc, weight scale size = 1; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index f3d96c3850656..bda6b90386475 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -93,7 +93,7 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( std::vector nodes; std::vector trans_axis0; - int flatten_axis0; + int flatten_axis0 = 0; for (int i = 0; i < times; i++) { PADDLE_ENFORCE_NOT_NULL( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc index c974d334a8de0..20075a49749f7 100644 --- a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc +++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc @@ -199,9 +199,11 @@ void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE(nms_out_rois_num); #undef GET_IR_NODE + auto* block = yolo_box0->Op()->Block(); + // create yolo_box_head #define CREATE_YOLO_BOX_HEAD(idx_) \ - framework::OpDesc yolo_box_head##idx_##_op_desc; \ + framework::OpDesc yolo_box_head##idx_##_op_desc(block); \ yolo_box_head##idx_##_op_desc.SetType("yolo_box_head"); \ 
yolo_box_head##idx_##_op_desc.SetInput("X", \ {yolo_box##idx_##_in_x->Name()}); \ @@ -222,7 +224,7 @@ void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const { #undef CREATE_YOLO_BOX_HEAD // create yolo_box_post - framework::OpDesc yolo_box_post_op_desc; + framework::OpDesc yolo_box_post_op_desc(block); yolo_box_post_op_desc.SetType("yolo_box_post"); yolo_box_post_op_desc.SetInput("Boxes0", {yolo_box0_out_boxes->Name()}); yolo_box_post_op_desc.SetInput("Boxes1", {yolo_box1_out_boxes->Name()}); diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 7a83fdccc218c..6479f7ae72654 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -148,6 +148,17 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, } } #endif + for (auto& var : main_program.Block(0).AllVars()) { + if (var->Persistable()) { + auto it = std::find(need_merge_var_names_.begin(), + need_merge_var_names_.end(), var->Name()); + if (it == need_merge_var_names_.end() && + var->GetType() != proto::VarType::SELECTED_ROWS) { + VLOG(2) << "train param: " << var->Name(); + trainable_param_.push_back(var->Name()); + } + } + } } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -192,18 +203,30 @@ void MultiTrainer::Run() { #ifdef PADDLE_WITH_HETERPS void MultiTrainer::MergeDenseParam() { -#ifdef PADDLE_WTIH_PSCORE +#ifdef PADDLE_WITH_PSCORE auto communicator = paddle::distributed::Communicator::GetInstance(); - auto& recv_ctx = communicator->GetRecvCtxMap(); - Scope* thread_scope = workers_[0]->GetThreadScope(); - for (auto& iter : recv_ctx) { - auto& varnames = iter.second; - for (auto& name : varnames) { + auto thread_scope = workers_[0]->GetThreadScope(); + if (communicator == nullptr) { + for (auto& name : trainable_param_) { + VLOG(2) << "merge var " << name << " to root scope"; Variable* root_var = root_scope_->FindVar(name); LoDTensor* root_tensor = root_var->GetMutable(); Variable* var = thread_scope->FindVar(name); LoDTensor* tensor = var->GetMutable(); - TensorCopy((*tensor), root_tensor->place(), root_tensor); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } + } else { + auto& recv_ctx = communicator->GetRecvCtxMap(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + VLOG(2) << "merge var " << name << " to root scope"; + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } } } #endif @@ -236,11 +259,7 @@ void MultiTrainer::Finalize() { } LoDTensor* root_tensor = root_var->GetMutable(); -#ifdef PADDLE_WITH_HETERPS - for (size_t j = 0; j < places_.size(); j++) { -#else for (int j = 1; j < thread_num_; j++) { -#endif Scope* cur_thread_scope = workers_[j]->GetThreadScope(); Variable* thread_var = cur_thread_scope->FindVar(need_merge_var_names_[i]); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 18287f0c7a4ee..69f14d7903c0b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1116,6 +1116,21 @@ class RuntimeInferShapeContext : public InferShapeContext { const RuntimeContext& ctx_; }; +struct OperatorWithKernel::CacheImpl { + explicit CacheImpl(phi::KernelContext* kernel_ctx, + RuntimeInferShapeContext* infer_shape_ctx) + : kernel_ctx_(kernel_ctx), 
infer_shape_ctx_(infer_shape_ctx) {} + + phi::KernelContext* getKernelContext() { return kernel_ctx_.get(); } + RuntimeInferShapeContext* getRuntimeInferShapeContext() { + return infer_shape_ctx_.get(); + } + + private: + std::unique_ptr kernel_ctx_; + std::unique_ptr infer_shape_ctx_; +}; + static void CheckTensorNANOrInf(const std::string& op_type, const std::string& name, const framework::Tensor& tensor) { @@ -1908,7 +1923,8 @@ Scope* OperatorWithKernel::PrepareData( (var->IsType() == true) && (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) && (paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout() == DataLayout::kNHWC)) { + .get_cur_paddle_data_layout() == DataLayout::kNHWC) && + (tensor_in->dims().size() >= 3)) { // Mixed execution : MKL-DNN and GPU is not supported! if (!new_scope) { new_scope = &scope.NewScope(); @@ -2322,6 +2338,8 @@ Scope* OperatorWithKernel::PreparePhiData( Tensor out; framework::TensorCopySync(*tensor_in, expected_place, &out); SetTensorToVariable(*var, out, trans_var); + + need_prepare_phi_data_ = true; } } @@ -2369,15 +2387,12 @@ void OperatorWithKernel::BuildPhiKernelContext( // deal with optional here if ((it == ctx.inputs.end() || it->second.size() == 0) && (input_defs[i].type_index == - std::type_index( - typeid(paddle::optional)) || + std::type_index(typeid(paddle::optional)) || input_defs[i].type_index == - std::type_index( - typeid(paddle::optional)) || + std::type_index(typeid(paddle::optional)) || input_defs[i].type_index == - std::type_index( - typeid(paddle::optional< - const std::vector>)))) { + std::type_index(typeid( + paddle::optional>)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2e00e07535b1d..2efa2e4bd8a75 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -698,6 +698,7 @@ class OperatorWithKernel : public OperatorBase { mutable std::unique_ptr runtime_ctx_; mutable const Scope* pre_scope_ = nullptr; mutable bool need_prepare_data_ = true; + mutable bool need_prepare_phi_data_ = false; mutable bool enable_cache_runtime_context_ = false; mutable bool all_kernels_must_compute_runtime_shape_ = false; mutable std::mutex cache_update_mutex_; @@ -710,6 +711,9 @@ class OperatorWithKernel : public OperatorBase { mutable std::unique_ptr kernel_signature_; mutable std::unique_ptr pt_kernel_; mutable std::unique_ptr arg_map_fn_; + + struct CacheImpl; + mutable CacheImpl* impl_{nullptr}; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index b86b4fec8a571..c78f7611b63be 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -129,6 +129,7 @@ class MultiTrainer : public TrainerBase { std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; + std::vector trainable_param_; #ifdef PADDLE_WITH_HETERPS std::vector places_; #endif diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 129f75e75de1e..ccc8d64517f95 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -279,16 +279,14 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, if (it == ins.end()) { if (LIKELY(input_defs[i].type_index == - 
std::type_index( - typeid(paddle::optional)))) { + std::type_index(typeid(paddle::optional)))) { kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); continue; } else if (input_defs[i].type_index == - std::type_index( - typeid(paddle::optional< - const std::vector>))) { + std::type_index(typeid( + paddle::optional>))) { kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b2d8afaa7b49c..aafbe57e05ff2 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -273,6 +273,11 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { if (pass->Type() != "graph_viz_pass" && !disable_logs_) { PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); } + // delete_fill_constant_op_pass is not apply under trt dynamic shape + if (pass->Type() == "delete_fill_constant_op_pass") { + bool use_dynamic = pass->Get("with_dynamic_shape"); + if (use_dynamic) continue; + } graph.reset(pass->Apply(graph.release())); } return graph; diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index a8c29579e12e7..083fc8991192e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -268,7 +268,7 @@ void LiteSubgraphPass::SetUpEngine( auto nnadapter_model_cache_token = Get>("nnadapter_model_cache_token"); - lite_api::TargetType target_type; + lite_api::TargetType target_type = TARGET(kX86); if (use_gpu) { target_type = TARGET(kCUDA); } else if (use_xpu) { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index adc3fc46f72ac..735e1b7be4c1f 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -633,11 +633,6 @@ void AnalysisConfig::Update() { (pass == "conv_bn_fuse_pass")) { continue; } - // delete_fill_constant_op_pass is not used under trt dynamic shape - if ((!min_input_shape_.empty() || trt_tuned_dynamic_shape_) && - pass == "delete_fill_constant_op_pass") { - continue; - } pass_builder()->AppendPass(pass); } } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index ecb5eaf982548..e8a1384166aff 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -522,7 +522,7 @@ TEST(Tensor, GpuShareExternalData) { auto out = predictor->GetOutputHandle("fc_1.tmp_2"); auto out_shape = out->shape(); - float* out_data; + float* out_data = nullptr; auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, std::multiplies()) * sizeof(float); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 21c79f0edd27f..4b4ad01f5674a 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -56,7 +56,7 @@ class MultiheadMatMulOpConverter : public OpConverter { weight_t->numel() * sizeof(float)); // 
(hidden_in, 3, hidden_out) - auto weight_dims = weight_t->dims(); + const auto& weight_dims = weight_t->dims(); int hidden_in = weight_dims[0]; // channels_in int three = weight_dims[1]; // channels_out diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc index 7c5eaa309ef18..13886f55dff01 100644 --- a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -41,7 +41,7 @@ class ReduceOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a paddle " << op_type << " op to tensorrt reduce layer"; framework::OpDesc op_desc(op, nullptr); - nvinfer1::ReduceOperation reduce_type; + nvinfer1::ReduceOperation reduce_type = nvinfer1::ReduceOperation::kSUM; if (op_type == "reduce_sum") { reduce_type = nvinfer1::ReduceOperation::kSUM; } else if (op_type == "reduce_mean") { diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 4e6b82d2dc146..0a6d24f90722e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -56,8 +56,6 @@ SlicePlugin::SlicePlugin(std::vector starts, std::vector ends, std::vector axes, bool with_fp16) : starts_(starts), ends_(ends), axes_(axes) { with_fp16_ = with_fp16; - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); } SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { @@ -66,15 +64,10 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &ends_); DeserializeValue(&serial_data, &serial_length, &axes_); DeserializeValue(&serial_data, &serial_length, &with_fp16_); - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); + DeserializeValue(&serial_data, &serial_length, &offset_info_); } -SlicePlugin::~SlicePlugin() { - cudaStreamDestroy(copy_stream_); - cudaEventDestroy(copy_event_); - cudaFree(offset_temp_data_); -} +SlicePlugin::~SlicePlugin() { cudaFree(offset_temp_data_); } SlicePlugin *SlicePlugin::clone() const TRT_NOEXCEPT { return new SlicePlugin(starts_, ends_, axes_, with_fp16_); @@ -159,11 +152,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, } cudaMemcpyAsync(offset_temp_data_, offset_info.data(), - sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, - copy_stream_); - - cudaEventRecord(copy_event_, copy_stream_); - cudaStreamWaitEvent(stream, copy_event_, 0); + sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream); int threads = 256; int blocks = (out_num + threads - 1) / threads; @@ -190,7 +179,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, size_t SlicePlugin::getSerializationSize() const TRT_NOEXCEPT { return getBaseSerializationSize() + SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_) + - SerializedSize(with_fp16_); + SerializedSize(with_fp16_) + SerializedSize(offset_info_); } void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { @@ -199,6 +188,7 @@ void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, offset_info_); } // Dynamic Plugin below. 
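// A minimal standalone sketch (not taken from the patch) of the stream-ordering pattern the
// slice-plugin hunks above rely on, and that the dynamic-plugin hunks below repeat: once the
// host-to-device copy of the offset table is issued on the same CUDA stream that enqueue()
// receives, stream ordering alone serializes the copy and the kernel, so the dedicated
// copy_stream_/copy_event_ pair and the cudaEventRecord/cudaStreamWaitEvent handshake are no
// longer needed. Promoting offset_info_ to a member (instead of a local vector) also keeps the
// host buffer alive across the async copy and lets serialize() persist it. The names below
// (launch_slice_kernel, enqueue_offsets_then_kernel) are hypothetical stand-ins, not plugin APIs.

#include <cuda_runtime.h>
#include <vector>

// Stand-in for the plugin's real slice kernel launch; intentionally a no-op in this sketch.
static void launch_slice_kernel(const int* /*device_offsets*/, cudaStream_t /*stream*/) {}

static void enqueue_offsets_then_kernel(const std::vector<int>& offset_info,
                                        int* offset_temp_data, cudaStream_t stream) {
  // The H2D copy is enqueued on the compute stream; the host vector must outlive the call,
  // which the member offset_info_ guarantees in the real plugin.
  cudaMemcpyAsync(offset_temp_data, offset_info.data(),
                  offset_info.size() * sizeof(int), cudaMemcpyHostToDevice, stream);
  // The kernel is enqueued on the same stream, so it cannot start before the copy completes.
  launch_slice_kernel(offset_temp_data, stream);
}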
@@ -209,8 +199,6 @@ SlicePluginDynamic::SlicePluginDynamic(std::vector starts, bool with_fp16) : starts_(starts), ends_(ends), axes_(axes), decrease_axis_(decrease_axis) { with_fp16_ = with_fp16; - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); } SlicePluginDynamic::SlicePluginDynamic(void const *serialData, @@ -220,13 +208,10 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData, DeserializeValue(&serialData, &serialLength, &axes_); DeserializeValue(&serialData, &serialLength, &decrease_axis_); DeserializeValue(&serialData, &serialLength, &with_fp16_); - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); + DeserializeValue(&serialData, &serialLength, &offset_info_); } void SlicePluginDynamic::destroy() TRT_NOEXCEPT { - cudaStreamDestroy(copy_stream_); - cudaEventDestroy(copy_event_); cudaFree(offset_temp_data_); delete this; } @@ -236,7 +221,7 @@ int SlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } size_t SlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t size = SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_) + SerializedSize(decrease_axis_) + - SerializedSize(with_fp16_); + SerializedSize(with_fp16_) + SerializedSize(offset_info_); return size; } @@ -247,6 +232,7 @@ void SlicePluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, axes_); SerializeValue(&buffer, decrease_axis_); SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, offset_info_); } nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( @@ -361,23 +347,19 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, offsets[axes_[i]] = starts_[i]; } - std::vector offset_info; + offset_info_.resize(num_dims * 3); for (size_t i = 0; i < num_dims; ++i) { - offset_info.push_back(offsets[i]); - offset_info.push_back(extends[i]); - offset_info.push_back(seg_offsets[i]); + offset_info_[i * 3 + 0] = offsets[i]; + offset_info_[i * 3 + 1] = extends[i]; + offset_info_[i * 3 + 2] = seg_offsets[i]; } if (offset_temp_data_ == nullptr) { cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int)); } - cudaMemcpyAsync(offset_temp_data_, offset_info.data(), - sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, - copy_stream_); - - cudaEventRecord(copy_event_, copy_stream_); - cudaStreamWaitEvent(stream, copy_event_, 0); + cudaMemcpyAsync(offset_temp_data_, offset_info_.data(), + sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream); int threads = 256; int blocks = (out_num + threads - 1) / threads; diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 4c07f0be36864..6b50a52df1fe5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -64,8 +64,7 @@ class SlicePlugin : public PluginTensorRT { std::vector ends_; std::vector axes_; int* offset_temp_data_{nullptr}; - cudaEvent_t copy_event_; - cudaStream_t copy_stream_; + std::vector offset_info_; }; class SlicePluginCreator : public TensorRTPluginCreator { @@ -144,8 +143,7 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { std::vector axes_; int decrease_axis_; int* offset_temp_data_{nullptr}; - cudaEvent_t copy_event_; - cudaStream_t copy_stream_; + std::vector offset_info_; }; class SlicePluginDynamicCreator : public TensorRTPluginCreator { diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 76bb8993cbefa..53e7993945586 100644 --- 
a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context) cc_library(stats SRCS stats.cc DEPS enforce) cc_library(memory DEPS malloc memcpy stats) +cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory) cc_test(stats_test SRCS stats_test.cc DEPS stats) if (WITH_GPU) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35ad27f4c62b5..46e1a500e4870 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -931,8 +931,11 @@ class AllocatorFacadePrivate { void WrapStatAllocator() { for (auto& pair : allocators_) { - // Now memory stats is only supported for GPU - if (platform::is_gpu_place(pair.first)) { + // Now memory stats is only supported for CPU and GPU + const platform::Place& place = pair.first; + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place) || + platform::is_gpu_place(place)) { pair.second = std::make_shared(pair.second); } } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 276c6bb0e69b8..5e5aea6dab2cc 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/pinned_allocator.h" - +#include "paddle/fluid/memory/stats.h" namespace paddle { namespace memory { namespace allocation { @@ -24,6 +24,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); delete allocation; } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { @@ -33,6 +34,7 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 71569366c2446..8b54b961596c2 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -30,16 +30,30 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } + underlying_allocator_->Free(allocation); } phi::Allocation* AllocateImpl(size_t size) override { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - allocation->size()); + + const platform::Place& place = allocation->place(); + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), + allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), + allocation->size()); + } 
return allocation.release(); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 37ac0b4483291..e1077d66c54ec 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/stats.h" + #ifdef _WIN32 #include #ifndef NOMINMAX @@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } } + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + return p; } @@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #else free(p); #endif + + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); } bool CPUAllocator::UseGpu() const { return false; } @@ -205,6 +211,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { if (result == gpuSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return p; } else { LOG(WARNING) << "cudaHostAlloc failed."; @@ -249,6 +256,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { err)); } #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); } bool CUDAPinnedAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc new file mode 100644 index 0000000000000..b2fc602e401ed --- /dev/null +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/memory/memory.h" +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { + +TEST(stat_allocator_test, host_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + AllocationPtr allocation = Alloc(platform::CPUPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(stat_allocator_test, device_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + AllocationPtr allocation = Alloc(platform::CUDAPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index 31d776de40702..97197b495f5fc 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -38,7 +38,7 @@ class StatRegistry { } std::string GetStatKey(const std::string& stat_type, int dev_id) { - return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type; + return stat_type + std::to_string(dev_id); } int64_t GetCurrentValue(const std::string& stat_type, int dev_id) { @@ -49,6 +49,10 @@ class StatRegistry { return GetStat(stat_type, dev_id)->GetPeakValue(); } + void Update(const std::string& stat_type, int dev_id, int64_t increment) { + GetStat(stat_type, dev_id)->Update(increment); + } + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { std::lock_guard lock_guard(stat_map_lock_); stat_map_[GetStatKey(stat_type, dev_id)] = stat; @@ -59,10 +63,6 @@ class StatRegistry { stat_map_.erase(GetStatKey(stat_type, dev_id)); } - void Update(const std::string& stat_type, int dev_id, int64_t increment) { - stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment); - } - private: StatRegistry() = default; @@ -72,43 +72,67 @@ class StatRegistry { SpinLock stat_map_lock_; }; -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) { - return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id); +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Device" + stat_type, + dev_id); } -int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) { - return 
StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Device" + stat_type, + dev_id); } -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) { - StatRegistry::GetInstance()->Update(stat_type, dev_id, increment); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } -#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \ - StatRegistry::GetInstance()->Register( \ - #item, id, Stat::GetInstance()); - -#define MEMORY_STAT_REGISTER(item) \ - MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 15) +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type, + dev_id); +} + +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Host" + stat_type, dev_id); +} + +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); +} + +#define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \ + StatRegistry::GetInstance()->Register( \ + "Device" #item, id, Stat::GetInstance()); + +#define DEVICE_MEMORY_STAT_REGISTER(item) \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15) + +#define HOST_MEMORY_STAT_REGISTER(item) \ + StatRegistry::GetInstance()->Register( \ + "Host" #item, 0, Stat::GetInstance()); int RegisterAllStats() { - MEMORY_STAT_REGISTER(Allocated); - MEMORY_STAT_REGISTER(Reserved); + DEVICE_MEMORY_STAT_REGISTER(Allocated); + DEVICE_MEMORY_STAT_REGISTER(Reserved); + + HOST_MEMORY_STAT_REGISTER(Allocated); + HOST_MEMORY_STAT_REGISTER(Reserved); return 0; } diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index b4850a8e9e919..bb6a3cca6644c 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -91,82 +91,113 @@ class Stat : public StatBase { 
std::atomic peak_value_{0}; }; -// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT -// values by a string, however, they has worse performance than the macro -// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and -// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low -// performance overhead is required. -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id); -int64_t StatGetPeakValue(const std::string& stat_type, int dev_id); -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); - -#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ - case id: \ - stat = paddle::memory::Stat< \ - paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \ +// xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate +// support to operate STAT values by a string, however, they has worse +// performance than the macro function xxx_MEMORY_STAT_CURRENT_VALUE, +// xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Try to use the macro +// functions where ultra-low performance overhead is required. +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ + case id: \ + stat = paddle::memory::Stat< \ + paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \ break -#define MEMORY_STAT_FUNC(item, id, func, ...) \ - [&] { \ - paddle::memory::StatBase* stat = nullptr; \ - switch (id) { \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ - default: \ - PADDLE_THROW(paddle::platform::errors::OutOfRange( \ - "Only support device id between [0, 15] in memory stats," \ - "not support device id: %d", \ - id)); \ - break; \ - } \ - return stat->func(__VA_ARGS__); \ +#define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...) 
\ + [&] { \ + paddle::memory::StatBase* stat = nullptr; \ + switch (id) { \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ + default: \ + PADDLE_THROW(paddle::platform::errors::OutOfRange( \ + "Only support device id between [0, 15] for device memory stats," \ + "not support device id: %d", \ + id)); \ + break; \ + } \ + return stat->func(__VA_ARGS__); \ }() -#define MEMORY_STAT_CURRENT_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetCurrentValue) -#define MEMORY_STAT_PEAK_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetPeakValue) -#define MEMORY_STAT_UPDATE(item, id, increment) \ - MEMORY_STAT_FUNC(item, id, Update, increment) - -#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \ - struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {}; - -#define MEMORY_STAT_DECLARE(item) \ - MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 15) +#define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ + DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) 
\ + [&] { \ + PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \ + "Only support device id 0 for host memory " \ + "stats, not support device id: %d", \ + id)); \ + return paddle::memory::Stat< \ + paddle::memory::HostMemoryStat##item##0>::GetInstance() \ + ->func(__VA_ARGS__); \ + }() + +#define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \ + HOST_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \ + struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {} + +#define DEVICE_MEMORY_STAT_DECLARE(item) \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15) + +// Only support id 0 for host memory stat +#define HOST_MEMORY_STAT_DECLARE(item) \ + struct HostMemoryStat##item##0 : public ThreadLocalStatBase{}; // To add a new STAT type, declare here and register in stats.cc -MEMORY_STAT_DECLARE(Allocated); -MEMORY_STAT_DECLARE(Reserved); +DEVICE_MEMORY_STAT_DECLARE(Allocated); +DEVICE_MEMORY_STAT_DECLARE(Reserved); + +HOST_MEMORY_STAT_DECLARE(Allocated); +HOST_MEMORY_STAT_DECLARE(Reserved); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc index 436c737916d9f..bcaba8e91080f 100644 --- a/paddle/fluid/memory/stats_test.cc +++ b/paddle/fluid/memory/stats_test.cc @@ -23,50 +23,77 @@ namespace paddle { namespace memory { -TEST(stats_test, MultiThreadReadWriteTest) { - std::string stat_type = "Allocated"; - size_t thread_num = 3; - size_t data_num = 10; - - std::condition_variable cv; - std::mutex mutex; - std::vector threads; - size_t ready_thread_num = 0; - - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back( - [&stat_type, data_num, &cv, &mutex, &ready_thread_num]() { - for (size_t data = 0; data < data_num; ++data) { - StatUpdate(stat_type, 0, data); - } - /* lock guard*/ { - std::lock_guard lock_guard{mutex}; - ++ready_thread_num; - cv.notify_one(); - } - // Sleep here to not exit before the main thread checking stat - // results, because the thread-local stat data will be destroyed when - // the thread exit - std::this_thread::sleep_for(std::chrono::seconds(1)); - }); +class StatsTest : public ::testing::Test { + protected: + void SetStatType(const std::string& stat_type) { stat_type_ = stat_type; } + + void SetFunc( + std::function update_func, + std::function current_value_func, + std::function peak_value_func) { + update_func_ = update_func; + current_value_func_ = current_value_func; + peak_value_func_ = peak_value_func; + } + + void RunTests() { + MultiThreadReadWriteTest(); + PeakValueTest(); } - 
std::unique_lock unique_lock(mutex); - cv.wait(unique_lock, [&ready_thread_num, thread_num]() { - return ready_thread_num == thread_num; - }); + private: + void MultiThreadReadWriteTest() { + size_t thread_num = 3; + size_t data_num = 10; + + std::condition_variable cv; + std::mutex mutex; + std::vector threads; + size_t ready_thread_num = 0; + + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&]() { + for (size_t data = 0; data < data_num; ++data) { + update_func_(stat_type_, 0, data); + } + /* lock guard*/ { + std::lock_guard lock_guard{mutex}; + ++ready_thread_num; + cv.notify_one(); + } + // Sleep here to not exit before the main thread checking stat + // results, because the thread-local stat data will be destroyed when + // the thread exit + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); + } - EXPECT_EQ(StatGetCurrentValue(stat_type, 0), - int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + std::unique_lock unique_lock(mutex); + cv.wait(unique_lock, [&ready_thread_num, thread_num]() { + return ready_thread_num == thread_num; + }); - for (size_t i = 0; i < thread_num; ++i) { - threads[i].join(); + EXPECT_EQ(current_value_func_(stat_type_, 0), + int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + + for (size_t i = 0; i < thread_num; ++i) { + threads[i].join(); + } + } + + void PeakValueTest() { + int64_t peak_value = ((int64_t)1) << 63; + int64_t sum = 0; + for (int64_t data : datas_) { + update_func_(stat_type_, 0, data); + sum += data; + peak_value = std::max(peak_value, sum); + } + EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value); } -} -TEST(stats_test, PeakValueTest) { - std::string stat_type = "Allocated"; - std::vector datas = { + std::string stat_type_; + std::vector datas_{ 543149808935355, 634698327471328, 706215795436611, 577939367795333, 419479490054362, 21975227714595, 812939817942250, 984428837942082, 537304104446806, 685008544452453, 563352858161268, 690143831596330, @@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) { 746465732805300, -74049761897414, -65640372433924, 852009039806484, 305079802044257, -48409757869238, 266031781660228, 327287322379820}; - int64_t peak_value = ((int64_t)1) << 63; - int64_t sum = 0; - for (int64_t data : datas) { - StatUpdate(stat_type, 0, data); - sum += data; - peak_value = std::max(peak_value, sum); - } - EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value); + std::function update_func_; + std::function current_value_func_; + std::function peak_value_func_; +}; + +TEST_F(StatsTest, DeviceAllocatedTest) { + SetStatType("Allocated"); + SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue, + DeviceMemoryStatPeakValue); + RunTests(); +} + +TEST_F(StatsTest, DeviceReservedMacroTest) { + SetStatType("Reserved"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id); + }); + RunTests(); +} + +TEST_F(StatsTest, HostAllocatedMacroTest) { + SetStatType("Allocated"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id); + 
}); + RunTests(); +} + +TEST_F(StatsTest, HostReservedTest) { + SetStatType("Reserved"); + SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue, + HostMemoryStatPeakValue); + RunTests(); } } // namespace memory diff --git a/paddle/fluid/operators/abs_op_mlu.cc b/paddle/fluid/operators/abs_op_mlu.cc new file mode 100644 index 0000000000000..3a3a484ea775e --- /dev/null +++ b/paddle/fluid/operators/abs_op_mlu.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AbsMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Abs(ctx, input_desc.get(), GetBasePtr(input), output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class AbsGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*x); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + Tensor sign_x; + sign_x.mutable_data(x->dims(), ctx.GetPlace()); + + MLUCnnl::Sign(ctx, input_desc.get(), GetBasePtr(x), input_desc.get(), + GetBasePtr(&sign_x)); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), input_desc.get(), + GetBasePtr(&sign_x), input_desc.get(), GetBasePtr(dout), + input_desc.get(), GetBasePtr(dx), ToCnnlDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(abs, ops::AbsMLUKernel, + ops::AbsMLUKernel); + +REGISTER_OP_MLU_KERNEL(abs_grad, ops::AbsGradMLUKernel, + ops::AbsGradMLUKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 5808841333f08..f9a93a47ff2be 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -161,8 +161,8 @@ class LazyZerosNPU { } auto place = dev_ctx.GetPlace(); auto stream = dev_ctx.stream(); - Tensor* zero_tensor; - void* zero_ptr; + Tensor* zero_tensor = nullptr; + void* zero_ptr = nullptr; if (found_inf_vec[0]) { int max_num = -1; for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 680183b6adf40..5c6b276c0172a 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ 
b/paddle/fluid/operators/arg_max_op_npu.cc @@ -34,11 +34,18 @@ struct VisitDataArgNPUMaxFunctor { out.template mutable_data(ctx.GetPlace()); auto axis = ctx.Attr("axis"); auto dtype = ctx.Attr("dtype"); + const bool& flatten = ctx.Attr("flatten"); + + Tensor transformed_x(x.type()); + transformed_x.ShareDataWith(x); + if (flatten) { + transformed_x.Resize(phi::make_ddim({x.numel()})); + } auto stream = ctx.template device_context().stream(); NpuOpRunner runner; runner.SetType("ArgMaxV2") - .AddInput(x) + .AddInput(transformed_x) .AddInput(std::vector{axis}) .AddOutput(out) .AddAttrDataType("dtype", dtype) diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 1064c77cc0041..a23cf2815d8fe 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -416,14 +416,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { 1) * vec_size; int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && (!fix_seed)) { + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); + if (!fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); seed_data = seed_offset.first; increment = seed_offset.second; } else { - std::random_device rnd; - seed_data = fix_seed ? seed + rank : rnd(); + seed_data = seed + rank; increment = offset; } RandomSampleClassCenter<< set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); } +#elif defined(PADDLE_WITH_MLU) + if (platform::is_mlu_place(context_.GetPlace())) { + FillMLUTensorWithHostValue(context_, static_cast(value_), tensor_); + } else { + phi::funcs::SetConstant set_constant; + set_constant(dev_ctx_, tensor_, static_cast(value_)); + } #else phi::funcs::SetConstant set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); @@ -509,6 +519,15 @@ REGISTER_OP_NPU_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_MLU) +REGISTER_OP_MLU_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + REGISTER_OP_VERSION(coalesce_tensor) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 088366dbc8f69..6ad22ff8b19eb 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -11,27 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h" namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(framework::proto::VarType::FP32, - ctx.GetPlace()); - } -}; - class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { @@ -45,53 +29,6 @@ Call calculation stream synchronization. } }; -template -class CSyncCalcStreamKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - - auto place = ctx.GetPlace(); - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - - platform::GpuStreamSync(dev_ctx->stream()); - -#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on npu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - platform::NPUStreamSync(dev_ctx->stream()); - -#elif defined(PADDLE_WITH_CNCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on mlu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - platform::MLUStreamSync(dev_ctx->stream()); -#elif defined(PADDLE_WITH_XPU_BKCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on xpu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - dev_ctx->Wait(); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } -}; - } // namespace operators } // namespace paddle @@ -105,5 +42,3 @@ REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); - -REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h new file mode 100644 index 0000000000000..b07367f801fa3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CSyncCalcStreamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +template +class CSyncCalcStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + + platform::GpuStreamSync(dev_ctx->stream()); + +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + platform::NPUStreamSync(dev_ctx->stream()); + +#elif defined(PADDLE_WITH_CNCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on mlu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + platform::MLUStreamSync(dev_ctx->stream()); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on xpu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + dev_ctx->Wait(); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/phi/core/storage.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc similarity index 65% rename from paddle/phi/core/storage.cc rename to paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc index 0ddf5084464cc..04a83ea64f076 100644 --- a/paddle/phi/core/storage.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,14 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/core/storage.h" +#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h" -namespace phi { +namespace ops = paddle::operators; +namespace plat = paddle::platform; -void TensorStorage::Realloc(size_t size) { - this->Clear(); - data_ = alloc_->Allocate(size); - size_ = size; -} - -} // namespace phi +REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel) diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 6684470e881cb..c256063090cc8 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -22,10 +22,10 @@ limitations under the License. */ namespace paddle { namespace operators { + template -class GlobalGatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { +struct GlobalGatherFunctor { + void operator()(const framework::ExecutionContext& ctx) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); @@ -137,6 +137,132 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { } }; +template +struct GlobalGatherProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto local_count = ctx.Input("local_count"); + auto global_count = ctx.Input("global_count"); + auto local_count_type = + framework::TransToProtoVarType(local_count->dtype()); + auto global_count_type = + framework::TransToProtoVarType(global_count->dtype()); + if (local_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in local_count.")); + } + if (global_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in global_count.")); + } + auto out = ctx.Output("Out"); + const int64_t* cpu_local_count_data; + const int64_t* cpu_global_count_data; + auto local_count_len = 0; + + framework::Tensor cpu_local_count; + if (platform::is_cpu_place(local_count->place())) { + cpu_local_count_data = local_count->data(); + local_count_len = local_count->numel(); + } else { + framework::TensorCopySync(*local_count, platform::CPUPlace(), + &cpu_local_count); + cpu_local_count_data = cpu_local_count.data(); + local_count_len = cpu_local_count.numel(); + } + + framework::Tensor cpu_global_count; + if (platform::is_cpu_place(global_count->place())) { + cpu_global_count_data = global_count->data(); + } else { + framework::TensorCopySync(*global_count, platform::CPUPlace(), + &cpu_global_count); + cpu_global_count_data = cpu_global_count.data(); + } + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for global gather op must be non-negative.", + ring_id)); + auto place = ctx.GetPlace(); + + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + distributed::ProcessGroup* pg = map->get(ring_id); + + int nranks = pg->GetSize(); + auto in_feat = x->dims()[1]; + auto n_expert = local_count->dims()[0] / nranks; + + auto fwd_count = 0; + + for (auto i = 0; i < local_count_len; ++i) { + fwd_count += cpu_local_count_data[i]; + } + framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + int64_t* expert_ptr = new 
int64_t[n_expert * nranks]; + expert_ptr[0] = 0; + auto tot_experts = n_expert * nranks; + for (auto i = 1; i < tot_experts; ++i) { + expert_ptr[i] = expert_ptr[i - 1] + cpu_local_count_data[i - 1]; + } + auto send_ptr = 0; + out->mutable_data(out_dims, place); + + for (auto i = 0; i < n_expert; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto j = 0; j < nranks; ++j) { + int idx = i + j * n_expert; + if (cpu_global_count_data[idx]) { + phi::DenseTensor tmp = *x; + pg->Send_Partial(tmp, j, send_ptr * in_feat, + cpu_global_count_data[idx] * in_feat); + send_ptr += cpu_global_count_data[idx]; + } + if (cpu_local_count_data[idx]) { + pg->Recv_Partial(*out, j, expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat); + } + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#endif + +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +template +class GlobalGatherOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + GlobalGatherProcessGroupFunctor functor_; + functor_(ctx); + } else { + GlobalGatherFunctor functor_; + functor_(ctx); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h index 3ff2df9e48f3d..47212b1d15581 100644 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ b/paddle/fluid/operators/collective/global_gather_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -33,5 +34,15 @@ class GlobalGatherOpCPUKernel : public framework::OpKernel { } }; +template +struct GlobalGatherFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + +template +struct GlobalGatherProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index cd3c3a3229ca0..df8d675ec9d71 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -22,10 +22,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { + template -class GlobalScatterOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { +struct GlobalScatterFunctor { + void operator()(const framework::ExecutionContext& ctx) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); @@ -137,6 +137,130 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { } }; +template +struct GlobalScatterProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto local_count = ctx.Input("local_count"); + auto global_count = ctx.Input("global_count"); + auto local_count_type = + framework::TransToProtoVarType(local_count->dtype()); + auto global_count_type = + framework::TransToProtoVarType(global_count->dtype()); + if (local_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in local_count.")); + } + if (global_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in global_count.")); + } + auto out = ctx.Output("Out"); + const int64_t* cpu_local_count_data; + const int64_t* cpu_global_count_data; + framework::Tensor cpu_local_count; + if (platform::is_cpu_place(local_count->place())) { + cpu_local_count_data = local_count->data(); + } else { + framework::TensorCopySync(*local_count, platform::CPUPlace(), + &cpu_local_count); + cpu_local_count_data = cpu_local_count.data(); + } + auto global_count_len = 0; + framework::Tensor cpu_global_count; + if (platform::is_cpu_place(global_count->place())) { + cpu_global_count_data = global_count->data(); + global_count_len = global_count->numel(); + } else { + framework::TensorCopySync(*global_count, platform::CPUPlace(), + &cpu_global_count); + cpu_global_count_data = cpu_global_count.data(); + global_count_len = cpu_global_count.numel(); + } + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for global scatter op must be non-negative.", + ring_id)); + + auto place = ctx.GetPlace(); + + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + distributed::ProcessGroup* pg = map->get(ring_id); + int nranks = pg->GetSize(); + auto in_feat = x->dims()[1]; + auto n_expert = local_count->dims()[0] / nranks; + int64_t fwd_count = 0; + + for (auto i = 0; i < global_count_len; ++i) { + fwd_count += cpu_global_count_data[i]; + } + framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + int64_t* expert_ptr = new int64_t[n_expert * nranks]; + expert_ptr[0] = 0; + auto tot_experts = n_expert * nranks; + for (auto i = 1; i < tot_experts; ++i) { + expert_ptr[i] = expert_ptr[i - 1] + cpu_local_count_data[i - 1]; + } + + auto recv_ptr = 0; + out->mutable_data(out_dims, place); + + for (auto i = 0; i < n_expert; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto j = 0; j < nranks; ++j) { + int idx = i + j * n_expert; + if (cpu_local_count_data[idx]) { + phi::DenseTensor tmp = *x; + pg->Send_Partial(tmp, j, expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat); + } + if (cpu_global_count_data[idx]) { + pg->Recv_Partial(*out, j, recv_ptr * in_feat, + cpu_global_count_data[idx] * in_feat); + recv_ptr += 
cpu_global_count_data[idx]; + } + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#endif + +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +template +class GlobalScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + GlobalScatterProcessGroupFunctor functor_; + functor_(ctx); + } else { + GlobalScatterFunctor functor_; + functor_(ctx); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h index 52b486aef25c2..aa567a284a6f7 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ b/paddle/fluid/operators/collective/global_scatter_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -33,5 +34,15 @@ class GlobalScatterOpCPUKernel : public framework::OpKernel { } }; +template +struct GlobalScatterFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + +template +struct GlobalScatterProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt index ca07d4a36ff3c..8de061a3cc2f6 100644 --- a/paddle/fluid/operators/compat/conv2d.pbtxt +++ b/paddle/fluid/operators/compat/conv2d.pbtxt @@ -77,14 +77,6 @@ extra { name: "fuse_relu" type: BOOLEAN } - attrs { - name: "fuse_brelu" - type: BOOLEAN - } - attrs { - name: "fuse_brelu_threshold" - type: FLOAT - } attrs { name: "fuse_activation" type: STRING @@ -134,4 +126,3 @@ extra { type: BOOLEAN } } - diff --git a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt index ded143986159f..1fbb99c03e833 100644 --- a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt +++ b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt @@ -69,14 +69,6 @@ extra { name: "fuse_relu" type: BOOLEAN } - attrs { - name: "fuse_brelu" - type: BOOLEAN - } - attrs { - name: "fuse_brelu_threshold" - type: FLOAT - } attrs { name: "fuse_activation" type: STRING @@ -126,4 +118,3 @@ extra { type: BOOLEAN } } - diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 6bf419c47a566..7ffbf1933be37 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -17,6 +17,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/assign_op.h" #include "paddle/phi/kernels/funcs/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -65,6 +69,12 @@ class ConditionalBlockOp : public ConditionalOp { scopes->resize(1); scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); +#ifdef PADDLE_WITH_MKLDNN + // (jczaja) Executor on being destroyed clears oneDNN cache and + // reset registered model data layout. This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif framework::Executor exec(dev_place); auto *block = Attr("sub_block"); VLOG(3) << "Conditional block.idx = " << block->ID() @@ -143,7 +153,7 @@ class ConditionalBlockGradOp : public ConditionalOp { /* keep_kid_scopes */ false); AssignLocalGradientToParentScope(dev_place, cur_scope, scope, - inside_grads, outside_grads); + inside_grads, outside_grads, inputs); return; } @@ -155,27 +165,36 @@ class ConditionalBlockGradOp : public ConditionalOp { const platform::Place &place, const framework::Scope &cur_scope, const framework::Scope &parent_scope, const std::vector &inside_grads, - const std::vector &outside_grads) const { + const std::vector &outside_grads, + const std::vector &inputs) const { + std::vector assign_zero_outside_grads; + std::vector assign_zero_inputs; for (size_t i = 0; i < outside_grads.size(); ++i) { const std::string &outside_grad_name = outside_grads[i]; const std::string &inside_grad_name = inside_grads[i]; VLOG(4) << "inside_grad_name = " << inside_grad_name << ", outside_grad_name = " << outside_grad_name; - framework::Variable *inside_var = - cur_scope.FindLocalVar(inside_grad_name); - if (inside_var == nullptr) { - continue; - } framework::Variable *outside_var = parent_scope.FindVar(outside_grad_name); if (outside_var == nullptr) { continue; } + framework::Variable *inside_var = + cur_scope.FindLocalVar(inside_grad_name); + if (inside_var == nullptr) { + assign_zero_outside_grads.emplace_back(outside_grad_name); + assign_zero_inputs.emplace_back(inputs[i]); + continue; + } platform::DeviceContext *dev_ctx = platform::DeviceContextPool::Instance().Get(place); framework::VisitVarType(*inside_var, AssignFunctor(outside_var, *dev_ctx)); } + // Assign zero to the grad_vars that are in outside_grads but not in + // inside_grads + AssignZeroToParentScope(place, parent_scope, assign_zero_inputs, + assign_zero_outside_grads); } void AssignZeroToParentScope( diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index eb44655c88f18..d8daa25f31be8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -17,6 +17,9 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace framework { class InferShapeContext; @@ -66,6 +69,12 @@ class WhileOp : public framework::OperatorBase { "the Condition's shape is ", cond.dims().to_str(), ".\n")); +#ifdef PADDLE_WITH_MKLDNN + // (jczaja) Executor on being destroyed clears oneDNN cache and + // resets registered model data layout. 
This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 419fb8a4ca703..3044aa6cf6c5a 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() { static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { if (!use_fixed_workspace) { int device_id = platform::GetCurrentDeviceId(); - int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", device_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", device_id); int64_t availble = platform::GpuAvailableMemToAlloc(); VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) << " MB, reserved=" << ToMegaBytes(reserved) diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 405794783812b..f084862b419d5 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -348,14 +348,6 @@ void Conv2DOpMaker::Make() { AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false) .AsExtra(); - AddAttr("fuse_brelu", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); - AddAttr("fuse_brelu_threshold", - "(float, default false 6.0) Only used in mkldnn kernel") - .SetDefault(6.0f) - .AsExtra(); AddAttr("fuse_activation", "(string, default \"\") Only used in mkldnn kernel") .SetDefault("") diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index f2beb4cec212e..9de5bc6ea3636 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -97,7 +97,7 @@ Crop Operator. Crop input into output, as specified by offsets and shape. There are two ways to set the offsets: -1. In runtime: Using the input 'Offsets', which is a Vairbale and can be +1. In runtime: Using the input 'Offsets', which is a Variable and can be output of other operators. This way is suitable for dynamic offsets. 2. In network configuration: Using the attribute 'offsets', which will be diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 972dea38f5746..798fd93006620 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -172,17 +172,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int seed = ctx.Attr("seed"); if (!is_test) { - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed == 0) { - // If perform `manual_seed` in python and inner seed is not specified - // (equals 0), use global generator generated seed. + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. 
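      // A standalone sketch of the seed policy this branch implements
      // (illustrative only; "Gen" stands in for the per-device generator and
      // is not the real framework class):
      //
      //   uint64_t ResolveSeed(int attr_seed, Gen& gen) {
      //     // A non-zero seed attribute wins; otherwise draw from the generator.
      //     return attr_seed != 0 ? static_cast<uint64_t>(attr_seed)
      //                           : gen.Random64();
      //   }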
+ int device_id = ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); - } else if (seed == 0) { - // use random generated seed - std::random_device rd; - seed = rd(); - } // else use `ctx.Attr("seed")` specified seed + } + // else use `ctx.Attr("seed")` specified seed } bool has_seq_length = ctx.HasInput("SequenceLength"); diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index ac9c440076257..b1bf5e2778167 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -15,9 +15,11 @@ limitations under the License. */ #pragma once #include #include "dgc/dgc.h" + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -153,18 +155,18 @@ class DGCOpKernel : public framework::OpKernel { u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e); // v = u + v + g - ElementwiseComputeEx, DeviceContext, T>( - ctx, u, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, phi::funcs::AddFunctor(), v_out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, g, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, g, v, 0, phi::funcs::AddFunctor(), v_out); } else { // u = m * u + g u_out_e.device(eigen_ctx) = m * u_e + grad_out_e; // v = u + v - ElementwiseComputeEx, DeviceContext, T>( - ctx, u, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, phi::funcs::AddFunctor(), v_out); } T* v_out_data = v_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/dirichlet_op.cu b/paddle/fluid/operators/dirichlet_op.cu index 63f9c7339bfc5..ac6480a8fa1c6 100644 --- a/paddle/fluid/operators/dirichlet_op.cu +++ b/paddle/fluid/operators/dirichlet_op.cu @@ -77,7 +77,7 @@ struct DirichletSampler { // init state, seed & offset for all threads int device_id = ctx.GetPlace().GetDeviceId(); - auto p_gen = framework::GetDefaultCUDAGenerator(device_id); + auto p_gen = framework::DefaultCUDAGenerator(device_id); auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset auto seed = seed_and_offset.first; auto offset = seed_and_offset.second; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index c62d45570ba29..571a1c97c52e8 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -26,7 +26,7 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const int offset, uint64_t* seed_data, uint64_t* increment) { int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); if (seed) { framework::Tensor seed_cpu_tensor; @@ -34,13 +34,12 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); *increment = offset; - } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { + } else if (!is_fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); *seed_data = seed_offset.first; *increment = seed_offset.second; } else { - std::random_device rnd; - *seed_data = is_fix_seed ? 
seed_val : rnd(); + *seed_data = seed_val; *increment = offset; } } diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 8fdde1ccdc058..6da0045443ccc 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -33,6 +33,13 @@ class EinsumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Operands", "(TensorList), The input tensor of einsum op.") .AsDuplicable(); AddOutput("Out", "(Tensor), The output tensor of einsum op."); + AddOutput( + "InnerCache", + "(Tensor), The cache of the forward transpose tensors: tA and tB.") + .AsDuplicable() + .AsExtra() + .AsIntermediate(); + AddAttr("equation", "(string) A einsum equation. such as `ij,jk->ik`" "There must have `->` and the number of operands in " @@ -72,6 +79,7 @@ class EinsumGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr retv) const override { retv->SetType("einsum_grad"); retv->SetInput("Operands", this->Input("Operands")); + retv->SetInput("InnerCache", this->Output("InnerCache")); retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); retv->SetAttrMap(this->Attrs()); retv->SetOutput(framework::GradVarName("Operands"), diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 53037c1fa6536..ed9b98a128a21 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -125,17 +123,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - grad_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); - REGISTER_OP_VERSION(elementwise_add) .AddCheckpoint( R"ROC(Register elementwise_add for adding the attribute of diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h deleted file mode 100644 index d77d4ed036394..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef __xpu__ -#include -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#else -#include -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -// only can include the headers in paddle/phi/include dirs -#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" -#include "paddle/phi/kernels/elementwise_add_kernel.h" -#endif - -namespace paddle { -namespace operators { - -template -class ElementwiseAddKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef __xpu__ - std::vector ins; - std::vector outs; - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - const auto& xpu_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel, 1>( - xpu_ctx, ins, &outs, axis, kps::AddFunctor()); -#else - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - *x, *y, axis, z); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps deleted file mode 100644 index ecd52a310acdb..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_XPU_KP - -// Please do not modify the following code -#if defined(__CUDA_ARCH__) -#undef __CUDA_ARCH__ -#endif - -#if defined(__CUDACC__) -#undef __CUDACC__ -#endif - -#if defined(__CUDA__) -#undef __CUDA__ -#endif - -#if defined(__NVCC__) -#undef __NVCC__ -#endif - -#include // NOLINT -#include "xpu/kernel/cluster_header.h" // NOLINT -#include "xpu/kernel/debug.h" // NOLINT -#include "xpu/kernel/math.h" // NOLINT - -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#else -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise_grad.h" -#endif - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, - ops::ElementwiseAddKernel); -#else -REGISTER_OP_CUDA_KERNEL( - grad_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index d35e3f6641b45..178aa329577b7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index feb73abf3ff08..22a5de4c60941 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 3e9263fe93acd..39a80e9571b29 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -75,7 +75,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx, paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); - float expected; + float expected = 0.0; if (op_type == "elementwise_add") { expected = 3.0; } else if (op_type == "elementwise_sub") { @@ -133,7 +133,7 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec); ctx.Wait(); - float expected_x, expected_y; + float expected_x = 0, expected_y = 0; if (op_type == "elementwise_add_grad") { expected_x = 1.0; expected_y = 6.0; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index d1a1aa3008c8b..070bf9511a9fe 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -145,8 +145,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { binary_prim->execute(astream, args); astream.wait(); - z->set_layout(DataLayout::kMKLDNN); - z->set_format(platform::GetMKLDNNFormat(*dst_memory)); + z->set_mem_desc(dst_memory->get_desc()); } }; @@ -179,7 +178,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); + dout->mem_desc(), platform::to_void_cast(dout->data())); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -189,7 +188,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { // elementwise_add & elementwise_sub if (BINARY_OP == dnnl::algorithm::binary_add || BINARY_OP == dnnl::algorithm::binary_sub) { - dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); @@ -218,8 +217,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { } astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dx->set_mem_desc(dst_memory->get_desc()); } if (dy) { @@ -232,7 +230,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { BINARY_OP == dnnl::algorithm::binary_sub) { if (dout->dims() == dy->dims()) { auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dy, dout->format(), ctx.GetPlace()); + dy, dout->mem_desc(), ctx.GetPlace()); dnnl::primitive_attr reorder_attr; std::vector scales(1); @@ -301,7 +299,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dst_memory = dst_dy_memory; } astream.wait(); - dy->set_layout(DataLayout::kMKLDNN); if (dout->dims() != dy->dims()) { // Broadcasting @@ -324,10 +321,10 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { {DNNL_ARG_DST, *dst_memory}, }); astream.wait(); - dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( - phi::vectorize(dy->dims())))); 
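          // In this broadcast path the reduction output keeps dout's rank with
          // the reduced axes collapsed to 1, so the destination descriptor is
          // reshaped to dy's dims before being attached to dy.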
+ dy->set_mem_desc(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims()))); } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dy->set_mem_desc(dst_memory->get_desc()); } } } diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 6c068d25d07a8..a6130c272d72b 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -217,16 +217,18 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - T s = scale[0]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[0]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[i]; - T v = x > s ? s : x; + ComputeDataType x = static_cast(in[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt_t * inv_s * v; - out[i] = static_cast( - round(static_cast::type>(v))); + out[i] = static_cast(round(v)); } } @@ -237,18 +239,19 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - T s = scale[0]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[0]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[i]; + ComputeDataType x = static_cast(in[i]); x = x > s ? s : x; x = x < -s ? -s : x; x = bin_cnt_t * inv_s * x; - x = static_cast( - round(static_cast::type>(x))); - out[i] = (x * s) / bin_cnt_t; + x = round(x); + out[i] = static_cast((x * s) / bin_cnt_t); } } @@ -302,17 +305,18 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, const T* in_c = in + blockIdx.x * channel_size; T* out_c = out + blockIdx.x * channel_size; - T s = scale[blockIdx.x]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[blockIdx.x]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int64_t i = tid; i < channel_size; i += blockDim.x) { - T x = in_c[i]; - T v = x > s ? s : x; + ComputeDataType x = static_cast(in_c[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt_t * inv_s * v; - out_c[i] = static_cast( - round(static_cast::type>(v))); + out_c[i] = static_cast(round(v)); } } @@ -322,16 +326,17 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN( const T* in, const T* scale, const int bin_cnt, const int64_t n, const int nScale, const int quant_stride, T* out) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) { - T s = scale[(i / quant_stride) % nScale]; - T inv_s = inverse(s); - T x = in[i]; - T v = x > s ? s : x; + ComputeDataType s = + static_cast(scale[(i / quant_stride) % nScale]); + ComputeDataType inv_s = inverse(s); + ComputeDataType x = static_cast(in[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? 
-s : v; v = bin_cnt_t * inv_s * v; - out[i] = static_cast( - round(static_cast::type>(v))); + out[i] = static_cast(round(v)); } } diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt old mode 100644 new mode 100755 index 03351dbca09e5..e23891d899de6 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -22,8 +22,10 @@ register_operators(EXCLUDES fused_transformer_op fused_feedforward_op fused_multi_transformer_op + fused_bias_dropout_residual_layer_norm_op resnet_unit_op - fused_gemm_epilogue_op) + fused_gemm_epilogue_op + fused_gate_attention_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -58,6 +60,7 @@ if (WITH_GPU OR WITH_ROCM) op_library(yolo_box_head_op) op_library(yolo_box_post_op) op_library(fused_embedding_eltwise_layernorm_op) + op_library(fused_gate_attention_op) # fusion_group if(NOT APPLE AND NOT WIN32) op_library(fusion_group_op DEPS device_code) @@ -79,6 +82,7 @@ if (WITH_GPU OR WITH_ROCM) # fused_attention_op op_library(fused_attention_op) op_library(fused_multi_transformer_op) + op_library(fused_bias_dropout_residual_layer_norm_op) endif() # resnet_unit needs cudnn 8.0 above if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index c4e73c6bf97fd..304aad16ad0c6 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,13 +16,17 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { + +using Tensor = framework::Tensor; // support gemm-nt and gemm-nn, which is used in fused_attention_op. template class AttnMatMul { @@ -44,32 +51,21 @@ class AttnMatMul { framework::Tensor* bias_out) { // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major. // here: (transa, transb): nt, input * weight. - CBLAS_TRANSPOSE transA = CblasNoTrans; - CBLAS_TRANSPOSE transB = CblasNoTrans; - if (transA_) { - transA = CblasTrans; - } - if (transB_) { - transB = CblasTrans; - } + CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE transB = transB_ ? 
CblasTrans : CblasNoTrans; T alpha = static_cast(1.0); T beta = static_cast(0.0); - // here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) + // (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, input->data(), weight->data(), beta, output->data()); if (compute_bias_) { - // compute output + bias - std::vector ins; - std::vector outs; - ins.emplace_back(output); - ins.emplace_back(bias); - outs.emplace_back(bias_out); - int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + // bias_out = output + bias + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); } } @@ -77,82 +73,71 @@ class AttnMatMul { const framework::Tensor* weight, const framework::Tensor* d_output, framework::Tensor* d_input, framework::Tensor* d_weight, - framework::Tensor* d_bias) { + framework::Tensor* d_bias, bool use_addto = false) { T alpha = static_cast(1.0); - T beta = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); - - CBLAS_TRANSPOSE dB_transA = CblasNoTrans; - CBLAS_TRANSPOSE dB_transB = CblasNoTrans; - CBLAS_TRANSPOSE dA_transA = CblasNoTrans; - CBLAS_TRANSPOSE dA_transB = CblasNoTrans; - int dB_m = 1; - int dB_n = 1; - int dB_k = 1; - int dA_m = 1; - int dA_n = 1; - int dA_k = 1; - - T* dB_input_1_ptr = nullptr; - T* dB_input_2_ptr = nullptr; - T* dB_output_ptr = d_weight->data(); - - T* dA_input_1_ptr = nullptr; - T* dA_input_2_ptr = nullptr; - T* dA_output_ptr = d_input->data(); + T beta_dA = use_addto ? static_cast(1.0) : static_cast(0.0); + T beta_dB = static_cast(0.0); + auto blas = phi::funcs::GetBlas(dev_ctx_); if (!transA_) { - // fw: gemm-nt + // forward: gemm-nt if (transB_) { - // bw: gemm-tn, dB = (dC)^t * A - dB_transA = CblasTrans; - dB_transB = CblasNoTrans; - dB_m = output_size_; - dB_n = input_size_; - dB_k = bsz_seq_; - - // bw: gemm-nn, dA = dC * B - dA_transA = CblasNoTrans; - dA_transB = CblasNoTrans; - dA_m = bsz_seq_; - dA_n = input_size_; - dA_k = output_size_; - - blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, - d_output->data(), input->data(), beta, dB_output_ptr); - blas.GEMM(dA_transA, dA_transB, dA_m, dA_n, dA_k, alpha, - d_output->data(), weight->data(), beta, dA_output_ptr); + // backward: gemm-tn, dB = (dC)^T * A + if (d_weight) { + int dB_m = output_size_; + int dB_n = input_size_; + int dB_k = bsz_seq_; + + T* dB_output_ptr = d_weight->data(); + blas.GEMM(CblasTrans, CblasNoTrans, dB_m, dB_n, dB_k, alpha, + d_output->data(), input->data(), beta_dB, + dB_output_ptr); + } + + // backward: gemm-nn, dA = dC * B + if (d_input) { + int dA_m = bsz_seq_; + int dA_n = input_size_; + int dA_k = output_size_; + + T* dA_output_ptr = d_input->data(); + blas.GEMM(CblasNoTrans, CblasNoTrans, dA_m, dA_n, dA_k, alpha, + d_output->data(), weight->data(), beta_dA, + dA_output_ptr); + } } else { // fw: gemm-nn - // bw: gemm-tn, dB = A^t * dC - dB_transA = CblasTrans; - dB_transB = CblasNoTrans; - dB_m = input_size_; - dB_n = output_size_; - dB_k = bsz_seq_; - - // bw: gemm-nt, dA = dC * B^t - dA_transA = CblasNoTrans; - dA_transB = CblasTrans; - dA_m = bsz_seq_; - dA_n = input_size_; - dA_k = output_size_; - - blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, - input->data(), d_output->data(), beta, dB_output_ptr); - blas.GEMM(dA_transA, 
dA_transB, dA_m, dA_n, dA_k, alpha, - d_output->data(), weight->data(), beta, dA_output_ptr); + // backward: gemm-tn, dB = A^T * dC + if (d_weight) { + int dB_m = input_size_; + int dB_n = output_size_; + int dB_k = bsz_seq_; + + T* dB_output_ptr = d_weight->data(); + blas.GEMM(CblasTrans, CblasNoTrans, dB_m, dB_n, dB_k, alpha, + input->data(), d_output->data(), beta_dB, + dB_output_ptr); + } + + // backward: gemm-nt, dA = dC * B^T + if (d_input) { + int dA_m = bsz_seq_; + int dA_n = input_size_; + int dA_k = output_size_; + + T* dA_output_ptr = d_input->data(); + blas.GEMM(CblasNoTrans, CblasTrans, dA_m, dA_n, dA_k, alpha, + d_output->data(), weight->data(), beta_dA, + dA_output_ptr); + } } - } else if (transB_) { - PADDLE_THROW(platform::errors::InvalidArgument( - "AttnMatMul wrapper do not support (transA=T, transB=T)" - "parameters.")); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "AttnMatMul wrapper do not support (transA=T, transB=N)" + "AttnMatMul wrapper do not support (transA=T, transB=T/N)" "parameters.")); } - if (compute_bias_) { - // reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} + if (compute_bias_ && d_bias) { + // reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} or {0,1,2,3} + // -> {3} or {0,1,2,3,4} -> {3,4} const auto input_dims = d_output->dims(); const auto output_dims = d_bias->dims(); bool support_case_1 = @@ -163,11 +148,22 @@ class AttnMatMul { bool support_case_2 = (input_dims.size() == 3 && output_dims.size() == 1 && (input_dims[2] == output_dims[0])); + bool support_case_3 = + (input_dims.size() == 4 && output_dims.size() == 1 && + input_dims[3] == output_dims[0]); + bool support_case_4 = + (input_dims.size() == 5 && output_dims.size() == 2 && + input_dims[3] == output_dims[0] && input_dims[4] == output_dims[1]); + + gpuStream_t stream = dev_ctx_.stream(); if (support_case_1 || support_case_2) { - gpuStream_t stream = dev_ctx_.stream(); TensorReduceImpl>( dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, stream); + } else if (support_case_3 || support_case_4) { + TensorReduceImpl>( + dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1, 2}, + stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only support reduce when the input dims are [0,1,2,3,4] and " diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index c5adee547bdac..516b10fa021c1 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -103,7 +103,7 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { // x is in NHWC format. 
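  // For reference, a self-contained sketch of the per-channel statistics this
  // helper computes for NHWC data (hypothetical free function, float only,
  // not the test's actual signature; needs <vector> and <cstddef>):
  //
  //   void ChannelSumAndSquareSum(const std::vector<float>& x, size_t c,
  //                               std::vector<float>* sum,
  //                               std::vector<float>* sum_of_square) {
  //     sum->assign(c, 0.f);
  //     sum_of_square->assign(c, 0.f);
  //     // NHWC is channel-innermost, so flat element i belongs to channel i % c.
  //     for (size_t i = 0; i < x.size(); ++i) {
  //       (*sum)[i % c] += x[i];
  //       (*sum_of_square)[i % c] += x[i] * x[i];
  //     }
  //   }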
- auto dims = cpu_x.dims(); + const auto &dims = cpu_x.dims(); int64_t c = dims[3]; const T *cpu_x_ptr = cpu_x.data(); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 884fca2c1b0b8..5881322007add 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -51,7 +51,7 @@ void InitRandomTensor(const std::vector &dims, template void TransposeNchwToNhwc(const framework::Tensor &cpu_in, framework::Tensor *cpu_out) { - auto in_dims = cpu_in.dims(); + const auto &in_dims = cpu_in.dims(); EXPECT_EQ(cpu_in.dims().size(), 4); const T *cpu_in_ptr = cpu_in.data(); @@ -184,7 +184,7 @@ template void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { - auto dims = cpu_out.dims(); + const auto &dims = cpu_out.dims(); int64_t c = dims[3]; const T *cpu_out_ptr = cpu_out.data(); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 3d75d127ab60a..38f9aff226ea9 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -12,12 +15,12 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" @@ -160,9 +163,9 @@ class FMHARef { ins.emplace_back(src_mask_tensor); outs.emplace_back(src_mask_out_tensor); int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, elewise_add_axis, + phi::funcs::AddFunctor()); phi::SoftmaxForwardCUDAKernelDriver( dev_ctx_, *src_mask_out_tensor, softmax_axis, softmax_out_tensor); @@ -297,7 +300,6 @@ class FMHARef { phi::SoftmaxBackwardCUDAKernelDriver( dev_ctx_, softmax_out_tensor, *softmax_out_grad_tensor, softmax_axis, src_mask_out_grad_tensor); - // recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out + // src_mask // Special case when dy is not needed and dx doesn't reduce diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 1f377810a2287..a1adec9641a6e 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -194,7 +194,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // the same as QKOut's shape. ctx->SetOutputDim("AttnDropoutOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); - if (ctx->Attrs().Get("attn_dropout_is_test") == false) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } @@ -206,7 +206,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { ctx->SetOutputDim("FMHAOut", {x_dim[0], x_dim[1], y_dim[1], y_dim[2]}); ctx->SetOutputDim("OutLinearOut", ctx->GetInputDim("X")); - if (ctx->Attrs().Get("dropout_is_test") == false) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); } @@ -301,7 +301,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { platform::errors::InvalidArgument( "'attn_dropout_rate' must be between 0.0 and 1.0.")); }); - AddAttr("attn_dropout_is_test", + AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); @@ -345,11 +345,6 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { platform::errors::InvalidArgument( "'dropout_rate' must be between 0.0 and 1.0.")); }); - - AddAttr("dropout_is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); AddAttr("dropout_fix_seed", "A flag indicating whether to use a fixed seed to generate " "random mask. 
NOTE: DO NOT set this flag to true in " @@ -418,10 +413,9 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->Attrs().Get("attn_dropout_is_test"), false, - platform::errors::InvalidArgument( - "GradOp is only callable when attn_dropout_is_test is false")); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); if (ctx->Attrs().Get("pre_layer_norm") == false) { OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean", diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index d26577f06fe68..f25bd53992894 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -19,7 +19,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" @@ -108,7 +109,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { const float ln_epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_rate = ctx.Attr("attn_dropout_rate"); - bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + bool is_test_1 = ctx.Attr("is_test"); auto &dropout_implementation_1 = ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = @@ -279,7 +280,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { const float ln2epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_prob = ctx.Attr("attn_dropout_rate"); - bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + bool is_test_1 = ctx.Attr("is_test"); auto &dropout_implementation_1 = ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = @@ -543,10 +544,9 @@ class FusedAttentionGradKernel : public framework::OpKernel { ins.emplace_back(d_x); outs.emplace_back(d_x); int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( + phi::funcs::BroadcastKernel( ctx.cuda_device_context(), ins, &outs, elewise_add_axis, - AddFunctor()); + phi::funcs::AddFunctor()); } }; diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc new file mode 100644 index 0000000000000..781f51d70ec66 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -0,0 +1,239 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnMean"), "Output", "LnMean", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnVariance"), "Output", "LnVariance", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), "Output", + "BiasDropoutResidualOut", "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"), "Output", "DropoutMaskOut", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", + "FusedBiasDropoutResidualLnOp"); + auto x_dim = ctx->GetInputDim("X"); + int left = 1; + for (int i = 0; i < x_dim.size() - 1; i++) { + left *= x_dim[i]; + } + ctx->SetOutputDim("BiasDropoutResidualOut", ctx->GetInputDim("X")); + if (ctx->Attrs().Get("is_test") == false) { + ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); + } + ctx->SetOutputDim("LnMean", {left}); + ctx->SetOutputDim("LnVariance", {left}); + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = framework::TransToProtoVarType(input->dtype()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class FusedBiasDropoutResidualLnOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("Residual", "The residual tensor."); + AddInput("Bias", "The linear bias tensor.").AsDispensable(); + AddInput("LnScale", + "(optional) Scale is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddInput("LnBias", + "(optional) Bias is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddOutput("BiasDropoutResidualOut", "Output of bias + dropout + residual.") + .AsIntermediate(); + AddOutput("DropoutMaskOut", "The random sampled dropout mask.") + .AsIntermediate(); + AddOutput("LnMean", "Mean of the current mini batch.").AsIntermediate(); + AddOutput("LnVariance", "Variance of the current mini batch.") + .AsIntermediate(); + AddOutput("Y", "Result."); + AddAttr("dropout_rate", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("dropout_fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. 
Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(true); + AddAttr("dropout_seed", "Dropout random seed.").SetDefault(0); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "The meaning is the same as 'attn_dropout_implementation'.") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("ln_epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &ln_epsilon) { + PADDLE_ENFORCE_EQ(ln_epsilon >= 0.0f && ln_epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' of the LayerNorm should be between " + "0.0 and 0.001, But received [%s].", + ln_epsilon)); + }); + + AddComment(R"DOC( + Add fused bias_dropout_residual_layer_norm op whose logic is as follows: + // @input: [batch_size, seq_len, embed_dim] + // @final_out: [batch_size, seq_len, embed_dim] + y = layer_norm(residual + dropout(bias + x)); + )DOC"); + } +}; + +class FusedBiasDropoutResidualLnGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean", + "FusedBiasDropoutResidualLnGrad"); + OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance", + "FusedBiasDropoutResidualLnGrad"); + OP_INOUT_CHECK(ctx->HasInput("BiasDropoutResidualOut"), "Input", + "BiasDropoutResidualOut", "FusedBiasDropoutResidualLnGrad"); + if (ctx->HasOutput(framework::GradVarName("LnScale"))) { + ctx->SetOutputDim(framework::GradVarName("LnScale"), + ctx->GetInputDim("LnScale")); + } + if (ctx->HasOutput(framework::GradVarName("LnBias"))) { + ctx->SetOutputDim(framework::GradVarName("LnBias"), + ctx->GetInputDim("LnBias")); + } + if (ctx->HasOutput(framework::GradVarName("Residual"))) { + ctx->SetOutputDim(framework::GradVarName("Residual"), + ctx->GetInputDim("Residual")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + ctx->SetOutputDim(framework::GradVarName("BiasDropoutResidualOut"), + ctx->GetInputDim("BiasDropoutResidualOut")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = framework::TransToProtoVarType(input->dtype()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +template +class FusedBiasDropoutResidualLnGradOpMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_bias_dropout_residual_layer_norm_grad"); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + op->SetInput("X", this->Input("X")); + 
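// [Editorial note] For reference, a minimal single-row sketch of the computation
// fused by fused_bias_dropout_residual_layer_norm, as documented in the op
// comment above: y = layer_norm(residual + dropout(bias + x)). It is evaluated
// here in inference mode with the "upscale_in_train" convention, where dropout
// is the identity. Illustrative only, not part of this patch; all names below
// are ad hoc.

#include <cmath>
#include <vector>

std::vector<float> ReferenceBiasDropoutResidualLn(
    const std::vector<float>& x, const std::vector<float>& residual,
    const std::vector<float>& bias, const std::vector<float>& ln_scale,
    const std::vector<float>& ln_bias, float ln_epsilon) {
  const size_t h = x.size();
  std::vector<float> out(h);
  // bias + x (dropout omitted at inference), then the residual connection.
  for (size_t i = 0; i < h; ++i) out[i] = residual[i] + x[i] + bias[i];
  // LayerNorm over the last dimension.
  float mean = 0.f, var = 0.f;
  for (float v : out) mean += v;
  mean /= static_cast<float>(h);
  for (float v : out) var += (v - mean) * (v - mean);
  var /= static_cast<float>(h);
  const float inv_std = 1.f / std::sqrt(var + ln_epsilon);
  for (size_t i = 0; i < h; ++i)
    out[i] = (out[i] - mean) * inv_std * ln_scale[i] + ln_bias[i];
  return out;
}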
op->SetInput("Residual", this->Input("Residual")); + if (this->HasInput("Bias")) { + op->SetInput("Bias", this->Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); + } + if (this->HasInput("LnScale")) { + op->SetInput("LnScale", this->Input("LnScale")); + op->SetOutput(framework::GradVarName("LnScale"), + this->InputGrad("LnScale")); + } + if (this->HasInput("LnBias")) { + op->SetInput("LnBias", this->Input("LnBias")); + op->SetOutput(framework::GradVarName("LnBias"), + this->InputGrad("LnBias")); + } + if (this->HasOutput("LnMean")) { + op->SetInput("LnMean", this->Output("LnMean")); + } + if (this->HasOutput("LnVariance")) { + op->SetInput("LnVariance", this->Output("LnVariance")); + } + if (this->HasOutput("BiasDropoutResidualOut")) { + op->SetInput("BiasDropoutResidualOut", + this->Output("BiasDropoutResidualOut")); + } + op->SetInput("DropoutMaskOut", this->Output("DropoutMaskOut")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Residual"), + this->InputGrad("Residual")); + op->SetOutput(framework::GradVarName("BiasDropoutResidualOut"), + this->OutputGrad("BiasDropoutResidualOut")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_bias_dropout_residual_layer_norm, ops::FusedBiasDropoutResidualLnOp, + ops::FusedBiasDropoutResidualLnOpMaker, + ops::FusedBiasDropoutResidualLnGradOpMaker, + ops::FusedBiasDropoutResidualLnGradOpMaker); +REGISTER_OPERATOR(fused_bias_dropout_residual_layer_norm_grad, + ops::FusedBiasDropoutResidualLnGradOp); diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu new file mode 100644 index 0000000000000..71a2c9728cc6b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -0,0 +1,148 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + auto *input_x = ctx.Input("X"); + auto *bias = ctx.Input("Bias"); + auto *residual = ctx.Input("Residual"); + const float ln_epsilon = ctx.Attr("ln_epsilon"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_bias = ctx.Input("LnBias"); + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Output("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Output("LnMean"); + auto *ln_var = ctx.Output("LnVariance"); + auto *y = ctx.Output("Y"); + auto *x_data = input_x->data(); + auto *bias_data = (bias == nullptr) ? nullptr : bias->data(); + auto *residual_data = (residual == nullptr) ? nullptr : residual->data(); + auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); + auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data()); + auto *bias_dropout_residual_out_data = + bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + auto *ln_mean_data = ln_mean->mutable_data(ctx.GetPlace()); + auto *ln_var_data = ln_var->mutable_data(ctx.GetPlace()); + auto *dropout_mask_out_data = + dropout_mask_out->mutable_data(ctx.GetPlace()); + auto *y_data = y->mutable_data(ctx.GetPlace()); + + const auto input_x_dims = input_x->dims(); + int bsz_seq = 1; + for (int i = 0; i < input_x_dims.size() - 1; i++) { + bsz_seq *= input_x_dims[i]; + } + int dim_embed = input_x_dims[input_x_dims.size() - 1]; + DropoutParam dropout_param(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param, + ln_epsilon); + // output = layernorm(residual + dropout(input + bias)) + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + ctx.cuda_device_context(), x_data, residual_data, bias_data, + ln_scale_data, ln_bias_data, bias_dropout_residual_out_data, + dropout_mask_out_data, y_data, ln_mean_data, ln_var_data); + } +}; + +template +class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + const float ln_epsilon = ctx.Attr("ln_epsilon"); + + auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *ln_scale = ctx.Input("LnScale"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Input("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Input("LnMean"); + auto *ln_var = ctx.Input("LnVariance"); + auto *d_y_data = d_y->data(); + auto *ln_scale_data = (ln_scale == nullptr ? 
nullptr : ln_scale->data()); + auto *dropout_mask_out_data = dropout_mask_out->data(); + auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data(); + auto *ln_mean_data = ln_mean->data(); + auto *ln_var_data = ln_var->data(); + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_residual = ctx.Output(framework::GradVarName("Residual")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_bias_dropout_residual_out = + ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); + auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); + auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + auto *d_x_data = d_x->mutable_data(ctx.GetPlace()); + auto *d_residual_data = d_residual->mutable_data(ctx.GetPlace()); + auto *d_bias_dropout_residual_out_data = + d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + auto *d_bias_data = + (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + auto *d_ln_scale_data = + (d_ln_scale == nullptr ? nullptr + : d_ln_scale->mutable_data(ctx.GetPlace())); + auto *d_ln_bias_data = + (d_ln_bias == nullptr ? nullptr + : d_ln_bias->mutable_data(ctx.GetPlace())); + + const auto input_x_dims = d_y->dims(); + int bsz_seq = 1; + for (int i = 0; i < input_x_dims.size() - 1; i++) { + bsz_seq *= input_x_dims[i]; + } + int dim_embed = input_x_dims[input_x_dims.size() - 1]; + DropoutParam dropout_param(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param, + ln_epsilon); + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + ctx.cuda_device_context(), d_y_data, bias_dropout_residual_out_data, + dropout_mask_out_data, ln_scale_data, ln_mean_data, ln_var_data, + d_bias_dropout_residual_out_data, d_ln_scale_data, d_ln_bias_data, + d_x_data, d_bias_data, d_residual_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(fused_bias_dropout_residual_layer_norm, + ops::FusedBiasDropoutResidualLnOpKernel, + ops::FusedBiasDropoutResidualLnOpKernel, + ops::FusedBiasDropoutResidualLnOpKernel); +REGISTER_OP_CUDA_KERNEL( + fused_bias_dropout_residual_layer_norm_grad, + ops::FusedBiasDropoutResidualLnGradKernel, + ops::FusedBiasDropoutResidualLnGradKernel, + ops::FusedBiasDropoutResidualLnGradKernel); diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 0a33a60f8123d..c352f08ec2ba7 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -82,7 +82,7 @@ struct DropoutParam { auto& dropout_implementation = context.Attr(pre_fix + "implementation"); is_upscale_in_train = (dropout_implementation == "upscale_in_train"); - is_test = context.Attr(pre_fix + "is_test"); + is_test = context.Attr("is_test"); fix_seed = context.Attr(pre_fix + "fix_seed"); std::string str_seed = "Dropout"; diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index a9b72a9cdf397..8527610247b05 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -138,7 +138,7 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); - paddle::optional scale_opt = paddle::none; + paddle::optional 
scale_opt; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); @@ -147,7 +147,7 @@ void LayerNorm(const std::vector> &scale, scale_opt = *tensor_scale; } - paddle::optional bias_opt = paddle::none; + paddle::optional bias_opt; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index f3f8f17427577..8e15232acda90 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -61,14 +61,14 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel { tmp_dim_x[dim_x.size() - 1] = dim_Linear1Weight[dim_Linear1Weight.size() - 1]; context->SetOutputDim("Out", dim_x); - if (context->Attrs().Get("dropout1_is_test") == false) { + if (context->Attrs().Get("is_test") == false) { context->SetOutputDim("Dropout1Mask", tmp_dim_x); } context->SetOutputDim("Dropout1Out", tmp_dim_x); context->SetOutputDim("Linear1Out", tmp_dim_x); context->SetOutputDim("Dropout2Out", dim_x); - if (context->Attrs().Get("dropout2_is_test") == false) { + if (context->Attrs().Get("is_test") == false) { context->SetOutputDim("Dropout2Mask", dim_x); } framework::DDim mean_dim = @@ -185,9 +185,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { "dropout2_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); - AddAttr("dropout1_is_test", "the is_test of first dropout") - .SetDefault(false); - AddAttr("dropout2_is_test", "the is_test of second dropout") + AddAttr("is_test", "the is_test attribute of dropout") .SetDefault(false); AddAttr("dropout1_fix_seed", "the is_test of first dropout") .SetDefault(false); @@ -218,10 +216,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout1_is_test"), false, - platform::errors::InvalidArgument( - "GradOp is only callable when is_test is false")); - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout2_is_test"), false, + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, platform::errors::InvalidArgument( "GradOp is only callable when is_test is false")); bool pre_layer_norm = ctx->Attrs().Get("pre_layer_norm"); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index c38d9f7d4bcbd..2eb9885286dab 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -17,9 +17,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" @@ -345,9 +346,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { ins[1] = d_x; outs[0] = d_x; int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - ctx, ins, &outs, elewise_add_axis, AddFunctor()); + phi::funcs::BroadcastKernel( + ctx, ins, &outs, elewise_add_axis, phi::funcs::AddFunctor()); } void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h new file mode 100644 index 0000000000000..cda33987d68ac --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -0,0 +1,647 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +inline std::string MemoryDebugString(const Tensor& t) { + std::stringstream ss; + ss << "shape=[" << t.dims() + << "], size=" << static_cast(t.memory_size()) / (1 << 20) + << " MB, ptr=" << t.data(); + + size_t total = 0; + size_t available = 0; + platform::GpuMemoryUsage(&available, &total); + ss << "; memory allocated=" + << static_cast(total - available) / (1 << 20) << " MB"; + return ss.str(); +} + +template +struct TernaryAddFunctor { + inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } +}; + +template +struct GateAttentionConfig { + public: + int64_t batch_size; + int64_t seq_len_m; + int64_t seq_len_r; + int64_t q_dim; + int64_t kv_dim; + int64_t key_dim; + int64_t m_size; + int64_t num_heads; + + phi::DDim qkv_out_dims; + phi::DDim qkv_transpose_out_dims; + + phi::DDim q_out_dims; + phi::DDim kv_out_dims; + phi::DDim q_transpose_out_dims; + phi::DDim kv_transpose_out_dims; + + phi::DDim qk_out_dims; + phi::DDim softmax_out_dims; + phi::DDim qktv_out_dims; + phi::DDim gate_out_dims; + + GateAttentionConfig(const Tensor* query, const Tensor* key, + const Tensor* query_weight, const Tensor* qkv_weight, + bool merge_qkv) { + // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] + batch_size = query->dims()[0]; + seq_len_m = query->dims()[1]; + seq_len_r = query->dims()[2]; + q_dim = query->dims()[3]; + + if (merge_qkv) { + PADDLE_ENFORCE_NOT_NULL( + qkv_weight, + platform::errors::NotFound("The input qkv_weight can not be nullptr " + "when merge_qkv is true.")); + + // When q_dim == kv_dim, QKV matmul can be computed merged. + // qkv_weight: shape=[3, num_heads, key_dim, q_dim] + num_heads = qkv_weight->dims()[1]; + key_dim = qkv_weight->dims()[2]; + m_size = seq_len_r; + kv_dim = q_dim; + + qkv_out_dims = {batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim}; + qkv_transpose_out_dims = {3, batch_size, seq_len_m, + num_heads, seq_len_r, key_dim}; + } else { + PADDLE_ENFORCE_NOT_NULL( + key, + platform::errors::NotFound( + "The input key can not be nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + query_weight, + platform::errors::NotFound("The input query_weight can not be " + "nullptr when merge_qkv is false.")); + + // When q_dim != kv_dim, QKV matmul must be computed saparately. 
+ // key: shape=[batch_size, seq_len_m, m_size, kv_dim] + // query_w: shape=[q_dim, num_heads, key_dim] + num_heads = query_weight->dims()[1]; + key_dim = query_weight->dims()[2]; + m_size = key->dims()[2]; + kv_dim = key->dims()[3]; + + q_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim}; + kv_out_dims = {batch_size, seq_len_m, m_size, num_heads, key_dim}; + q_transpose_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, + key_dim}; + kv_transpose_out_dims = {batch_size, seq_len_m, num_heads, m_size, + key_dim}; + } + + qk_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size}; + softmax_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size}; + qktv_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, key_dim}; + gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim}; + } + + int64_t GetQuerySize() const { + return batch_size * seq_len_m * seq_len_r * num_heads * key_dim; + } + + Tensor* GetQKVOut(const platform::CUDADeviceContext& dev_ctx) { + if (!qkv_out.IsInitialized()) { + qkv_out.Resize(qkv_out_dims); + qkv_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qkv_out: " << MemoryDebugString(qkv_out); + } + return &qkv_out; + } + + Tensor* GetQueryOut(const platform::CUDADeviceContext& dev_ctx) { + if (!query_out.IsInitialized()) { + query_out.Resize(q_out_dims); + query_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "query_out: " << MemoryDebugString(query_out); + } + return &query_out; + } + + Tensor* GetKeyOut(const platform::CUDADeviceContext& dev_ctx) { + if (!key_out.IsInitialized()) { + key_out.Resize(kv_out_dims); + key_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "key_out: " << MemoryDebugString(key_out); + } + return &key_out; + } + + Tensor* GetValueOut(const platform::CUDADeviceContext& dev_ctx) { + if (!value_out.IsInitialized()) { + value_out.Resize(kv_out_dims); + value_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "value_out: " << MemoryDebugString(value_out); + } + return &value_out; + } + + Tensor* GetQKOut(const platform::CUDADeviceContext& dev_ctx, + Tensor* softmax_out) { + // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] + int softmax_dim = m_size; + if (!softmax_out || phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { + // Not sure whether cudnn softmax can execute inplace. + if (!qkv_out.IsInitialized()) { + qk_out.Resize(qk_out_dims); + qk_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qk_out: " << MemoryDebugString(qk_out); + } + return &qk_out; + } else { + return softmax_out; + } + } + + void ClearQKVOut() { + if (qkv_out.IsInitialized()) { + qkv_out.clear(); + } + } + + void ClearQKOut() { + if (qk_out.IsInitialized()) { + qk_out.clear(); + } + } + + protected: + Tensor qkv_out; + // QKV is not merged + Tensor query_out; + Tensor key_out; + Tensor value_out; + // qk_out = BatchedGEMM(Q, K^T) + // qk_out: shape=[batch_size, seq_len_m, num_heads, seq_len_r, m_size] + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + // The shape of qk_out, softmax_out is the same, thus can be called inplace. 
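// [Editorial note] GetQKOut() above follows a lazy-workspace pattern: qk_out is
// only materialized when cuDNN softmax needs a buffer separate from
// softmax_out; otherwise softmax_out's storage is reused and qk_out is never
// allocated. One detail worth double-checking: GetQKOut() tests
// qkv_out.IsInitialized() before allocating qk_out, which looks like it was
// meant to be qk_out.IsInitialized(). The allocate-on-first-use idiom as a
// stand-alone sketch (illustrative only, not part of this patch; it reuses the
// same Tensor calls already used above):

Tensor* LazyWorkspace(const platform::CUDADeviceContext& dev_ctx,
                      Tensor* workspace, const phi::DDim& dims,
                      Tensor* reusable, bool need_separate_buffer) {
  if (!need_separate_buffer) {
    return reusable;  // reuse memory that is already allocated
  }
  if (!workspace->IsInitialized()) {
    workspace->Resize(dims);
    workspace->mutable_data<T>(dev_ctx.GetPlace());  // allocate exactly once
  }
  return workspace;
}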
+ Tensor qk_out; +}; + +template +struct GateAttentionGradConfig : public GateAttentionConfig { + public: + GateAttentionGradConfig(const Tensor* query, const Tensor* key, + const Tensor* query_weight, const Tensor* qkv_weight, + bool merge_qkv) + : GateAttentionConfig(query, key, query_weight, qkv_weight, + merge_qkv) {} + + Tensor* GetQKVOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!qkv_out_grad.IsInitialized()) { + qkv_out_grad.Resize(this->qkv_out_dims); + qkv_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qkv_out_grad: " << MemoryDebugString(qkv_out_grad); + } + return &qkv_out_grad; + } + + Tensor* GetQueryOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!query_out_grad.IsInitialized()) { + query_out_grad.Resize(this->q_out_dims); + query_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "query_out_grad: " << MemoryDebugString(query_out_grad); + } + return &query_out_grad; + } + + Tensor* GetKeyOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!key_out_grad.IsInitialized()) { + key_out_grad.Resize(this->kv_out_dims); + key_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "key_out_grad: " << MemoryDebugString(key_out_grad); + } + return &key_out_grad; + } + + Tensor* GetValueOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!value_out_grad.IsInitialized()) { + value_out_grad.Resize(this->kv_out_dims); + value_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "value_out_grad: " << MemoryDebugString(value_out_grad); + } + return &value_out_grad; + } + + Tensor* GetQKOutGrad(const platform::CUDADeviceContext& dev_ctx, + Tensor* softmax_out_grad) { + // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] + int softmax_dim = this->m_size; + if (!softmax_out_grad || + phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { + if (!qk_out_grad.IsInitialized()) { + qk_out_grad.Resize(this->qk_out_dims); + qk_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qk_out_grad: " << MemoryDebugString(qk_out_grad); + } + return &qk_out_grad; + } else { + return softmax_out_grad; + } + } + + protected: + Tensor qkv_out_grad; + Tensor query_out_grad; + Tensor key_out_grad; + Tensor value_out_grad; + Tensor qk_out_grad; +}; + +template +class FMHAGateRef { + public: + FMHAGateRef(const platform::CUDADeviceContext& dev_ctx, bool merge_qkv) + : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} + + void ComputeForward(const Tensor* nonbatched_bias, const Tensor* src_mask, + Tensor* q_transpose_out, Tensor* k_transpose_out, + Tensor* v_transpose_out, Tensor* qkv_transpose_out, + Tensor* softmax_out, Tensor* fmha_out, + GateAttentionConfig* config) { + T* q_ptr = nullptr; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + if (merge_qkv_) { + // qkv_transpose_out = transpose(qkv_out) + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + Tensor* qkv_out = config->GetQKVOut(dev_ctx_); + ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); + config->ClearQKVOut(); + + // q_size == k_size + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); + 
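// [Editorial note] The remainder of ComputeForward() below implements plain
// scaled dot-product attention per (batch, seq_len_m, head):
//   qk_out      = (1 / sqrt(key_dim)) * Q * K^T
//   softmax_out = softmax(qk_out + nonbatched_bias + src_mask)
//   qktv_out    = softmax_out * V
//   fmha_out    = transpose(qktv_out)
// A naive single-query reference of the same math, with bias and mask folded
// into one additive term (illustrative only, not part of this patch):

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> NaiveAttentionRow(
    const std::vector<float>& q,                 // [key_dim]
    const std::vector<std::vector<float>>& k,    // [m_size][key_dim]
    const std::vector<std::vector<float>>& v,    // [m_size][key_dim]
    const std::vector<float>& bias) {            // [m_size]
  const size_t m_size = k.size(), key_dim = q.size();
  std::vector<float> scores(m_size);
  const float scale = 1.f / std::sqrt(static_cast<float>(key_dim));
  for (size_t j = 0; j < m_size; ++j) {
    float dot = 0.f;
    for (size_t d = 0; d < key_dim; ++d) dot += q[d] * k[j][d];
    scores[j] = dot * scale + bias[j];
  }
  // Numerically stable softmax over the m_size axis.
  const float max_score = *std::max_element(scores.begin(), scores.end());
  float denom = 0.f;
  for (size_t j = 0; j < m_size; ++j) {
    scores[j] = std::exp(scores[j] - max_score);
    denom += scores[j];
  }
  // Weighted sum of V rows.
  std::vector<float> out(key_dim, 0.f);
  for (size_t j = 0; j < m_size; ++j)
    for (size_t d = 0; d < key_dim; ++d) out[d] += (scores[j] / denom) * v[j][d];
  return out;
}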
PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + Tensor* query_out = config->GetQueryOut(dev_ctx_); + Tensor* key_out = config->GetKeyOut(dev_ctx_); + Tensor* value_out = config->GetValueOut(dev_ctx_); + ComputeQKVTransposeForward(*query_out, *key_out, *value_out, + q_transpose_out, k_transpose_out, + v_transpose_out); + + // q_size != k_size + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + } + + // qk_out = BatchedGEMM(Q, K^T) + // [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] * + // [batch_size, seq_len_m, num_heads, m_size, key_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] + Tensor* qk_out = config->GetQKOut(dev_ctx_, softmax_out); + T* qk_out_ptr = qk_out->data(); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + int64_t gemm_m = config->seq_len_r; + int64_t gemm_n = config->m_size; + int64_t gemm_k = config->key_dim; + + T alpha = static_cast(1.0 / sqrt(config->key_dim)); + ComputeBatchedGEMM(q_ptr, k_ptr, qk_out_ptr, false, true, gemm_m, gemm_n, + gemm_k, gemm_batch_size, alpha); + + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + ComputeBiasMaskSoftmaxForward(nonbatched_bias, src_mask, qk_out, + softmax_out); + config->ClearQKOut(); + + // qktv_out = BatchedGEMM(softmax_out, V) + // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * + // [batch_size, seq_len_m, num_heads, m_size, key_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] + Tensor qktv_out; + qktv_out.Resize(config->qktv_out_dims); + T* qktv_out_ptr = qktv_out.mutable_data(dev_ctx_.GetPlace()); + + gemm_m = config->seq_len_r; + gemm_n = config->key_dim; + gemm_k = config->m_size; + + T* softmax_out_ptr = softmax_out->data(); + ComputeBatchedGEMM(softmax_out_ptr, v_ptr, qktv_out_ptr, false, false, + gemm_m, gemm_n, gemm_k, gemm_batch_size); + + // fmha_out = transpose(qktv_out) + ComputeQKTVTransposeForward(qktv_out, fmha_out); + } + + void ComputeBackward(const Tensor* q_transpose_out, + const Tensor* k_transpose_out, + const Tensor* v_transpose_out, + const Tensor* qkv_transpose_out, + const Tensor* softmax_out, const Tensor* fmha_out_grad, + Tensor* src_mask_grad, Tensor* nonbatched_bias_grad, + GateAttentionGradConfig* config) { + const T* q_ptr = nullptr; + const T* k_ptr = nullptr; + const T* v_ptr = nullptr; + + T* q_grad_ptr = nullptr; + T* k_grad_ptr = nullptr; + T* v_grad_ptr = nullptr; + + Tensor q_transpose_out_grad; + Tensor k_transpose_out_grad; + Tensor v_transpose_out_grad; + Tensor qkv_transpose_out_grad; + if (merge_qkv_) { + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + + qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); + + q_grad_ptr = qkv_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + k_grad_ptr = q_grad_ptr + q_size; + v_grad_ptr = k_grad_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is 
false.")); + PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + + q_transpose_out_grad.Resize(config->q_transpose_out_dims); + k_transpose_out_grad.Resize(config->kv_transpose_out_dims); + v_transpose_out_grad.Resize(config->kv_transpose_out_dims); + + q_grad_ptr = q_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + k_grad_ptr = k_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + v_grad_ptr = v_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + } + + Tensor softmax_out_grad; + softmax_out_grad.Resize(config->softmax_out_dims); + softmax_out_grad.mutable_data(dev_ctx_.GetPlace()); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + { + // Forward: fmha_out = transpose(qktv_out) + Tensor qktv_out_grad; + qktv_out_grad.Resize(config->qktv_out_dims); + T* qktv_out_grad_ptr = qktv_out_grad.mutable_data(dev_ctx_.GetPlace()); + ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); + + // Forward: qktv_out = BatchedGEMM(softmax_out, V) + // Backward: + // V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->key_dim; + int64_t gemm_k = config->seq_len_r; + + const T* softmax_out_ptr = softmax_out->data(); + ComputeBatchedGEMM(softmax_out_ptr, qktv_out_grad_ptr, v_grad_ptr, true, + false, gemm_m, gemm_n, gemm_k, gemm_batch_size); + + // Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T) + gemm_m = config->seq_len_r; + gemm_n = config->m_size; + gemm_k = config->key_dim; + + T* softmax_out_grad_ptr = softmax_out_grad.data(); + ComputeBatchedGEMM(qktv_out_grad_ptr, v_ptr, softmax_out_grad_ptr, false, + true, gemm_m, gemm_n, gemm_k, gemm_batch_size); + } + + Tensor* qk_out_grad = config->GetQKOutGrad(dev_ctx_, &softmax_out_grad); + ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, softmax_out, + src_mask_grad, qk_out_grad, + nonbatched_bias_grad); + + // Forward: qk_out = BatchedGEMM(Q, K^T) + // Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->key_dim; + int64_t gemm_k = config->seq_len_r; + T alpha = static_cast(1.0 / sqrt(config->key_dim)); + + T* qk_out_grad_ptr = qk_out_grad->data(); + ComputeBatchedGEMM(qk_out_grad_ptr, q_ptr, k_grad_ptr, true, false, gemm_m, + gemm_n, gemm_k, gemm_batch_size, alpha); + + // Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y) + gemm_m = config->seq_len_r; + gemm_n = config->key_dim; + gemm_k = config->m_size; + ComputeBatchedGEMM(qk_out_grad_ptr, k_ptr, q_grad_ptr, false, false, gemm_m, + gemm_n, gemm_k, gemm_batch_size, alpha); + + if (merge_qkv_) { + Tensor* qkv_out_grad = config->GetQKVOutGrad(dev_ctx_); + ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); + } else { + Tensor* q_out_grad = config->GetQueryOutGrad(dev_ctx_); + Tensor* k_out_grad = config->GetKeyOutGrad(dev_ctx_); + Tensor* v_out_grad = config->GetValueOutGrad(dev_ctx_); + ComputeQKVTransposeBackward(q_transpose_out_grad, k_transpose_out_grad, + v_transpose_out_grad, q_out_grad, k_out_grad, + v_out_grad); + } + } + + void ComputeQKVTransposeForward(const Tensor& q_out, const Tensor& k_out, + const Tensor& v_out, Tensor* q_transpose_out, + Tensor* k_transpose_out, + Tensor* v_transpose_out) { + int ndims = 5; + std::vector 
perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, q_out, perm, q_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, ndims, k_out, perm, k_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, ndims, v_out, perm, v_transpose_out); + } + + void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad, + const Tensor& k_transpose_out_grad, + const Tensor& v_transpose_out_grad, + Tensor* q_out_grad, Tensor* k_out_grad, + Tensor* v_out_grad) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, q_transpose_out_grad, perm, + q_out_grad); + TransposeGPUKernelDriver(dev_ctx_, ndims, k_transpose_out_grad, perm, + k_out_grad); + TransposeGPUKernelDriver(dev_ctx_, ndims, v_transpose_out_grad, perm, + v_out_grad); + } + + // [batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim] -> + // [3, batch_size, seq_len_m, num_heads, seq_len_r, key_dim] + void ComputeQKVTransposeForward(const Tensor& qkv_out, + Tensor* qkv_transpose_out) { + int ndims = 6; + std::vector perm = {3, 0, 1, 4, 2, 5}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_out, perm, + qkv_transpose_out); + } + + void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad, + Tensor* qkv_out_grad) { + int ndims = 6; + std::vector perm = {1, 2, 4, 0, 3, 5}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_transpose_out_grad, perm, + qkv_out_grad); + } + + // [batch_size, seq_len_m, num_head, seq_len_r, c] -> + // [batch_size, seq_len_m, seq_len_r, num_head, c] + void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qktv_out, perm, fmha_out); + } + + void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad, + Tensor* qktv_out_grad) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, fmha_out_grad, perm, + qktv_out_grad); + } + + // qk_out = qk_out + nonbatched_bias + src_mask + // softmax_out = softmax(src_mask_out) + void ComputeBiasMaskSoftmaxForward(const Tensor* nonbatched_bias, + const Tensor* src_mask, Tensor* qk_out, + Tensor* softmax_out) { + if (nonbatched_bias) { + std::vector ins = {qk_out, nonbatched_bias, src_mask}; + std::vector outs = {qk_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, TernaryAddFunctor()); + } else { + std::vector ins = {qk_out, src_mask}; + std::vector outs = {qk_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); + } + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out, -1, softmax_out); + } + + // src_mask_out = qk_out + nonbatched_bias + src_mask + // softmax_out = softmax(src_mask_out) + void ComputeBiasMaskSoftmaxBackward(const Tensor* softmax_out_grad, + const Tensor* softmax_out, + Tensor* src_mask_grad, + Tensor* qk_out_grad, + Tensor* nonbatched_bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + qk_out_grad, + platform::errors::NotFound("The qk_out_grad can not be nullptr.")); + + PADDLE_ENFORCE_EQ(qk_out_grad->dims(), softmax_out->dims(), + platform::errors::InvalidArgument( + "The shape of qk_out_grad and softmax_out is " + "expected to be the same. 
But recieved qk_out_grad's " + "shape = %s, softmax_out's shape = %s.", + qk_out_grad->dims(), softmax_out->dims())); + + PADDLE_ENFORCE_EQ(src_mask_grad, nullptr, + platform::errors::InvalidArgument( + "src_mask_grad is expected to be nullptr.")); + + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx_, *softmax_out, + *softmax_out_grad, -1, qk_out_grad); + + // [1, bs, num_head, seq_l, seq_l] -> [bs, num_head, seq_l, seq_l] + if (nonbatched_bias_grad) { + gpuStream_t stream = dev_ctx_.stream(); + TensorReduceImpl>( + dev_ctx_, *qk_out_grad, nonbatched_bias_grad, + kps::IdentityFunctor(), {0, 1}, stream); + } + } + + private: + void ComputeBatchedGEMM(const T* a_ptr, const T* b_ptr, T* c_ptr, + bool trans_a, bool trans_b, int64_t m, int64_t n, + int64_t k, int64_t batch_size, + T alpha = static_cast(1.0), + T beta = static_cast(0.0)) { + CBLAS_TRANSPOSE cblas_trans_a = trans_a ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE cblas_trans_b = trans_b ? CblasTrans : CblasNoTrans; + int64_t stride_a = m * k; + int64_t stride_b = k * n; + + auto blas = phi::funcs::GetBlas(dev_ctx_); + blas.BatchedGEMM(cblas_trans_a, cblas_trans_b, m, n, k, alpha, a_ptr, b_ptr, + beta, c_ptr, batch_size, stride_a, stride_b); + } + + const platform::CUDADeviceContext& dev_ctx_; + bool merge_qkv_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc new file mode 100644 index 0000000000000..ba9dbd82e3dcc --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -0,0 +1,317 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +class FusedGateAttentionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input", "OutLinearWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias", + "fused_gate_attention"); + + OP_INOUT_CHECK(ctx->HasOutput("SoftmaxOut"), "Output", "SoftmaxOut", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("FMHAOut"), "Output", "FMHAOut", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "fused_gate_attention"); + + auto input_q_dims = ctx->GetInputDim("Query"); + int batch_size = input_q_dims[0]; + int seq_len_m = input_q_dims[1]; + int seq_len_r = input_q_dims[2]; + + int num_head, m_size, key_dim; + if (ctx->Attrs().Get("merge_qkv")) { + // QKV's input: [batch_size, seq_len_m, seq_len_r, qkv_dim] + // QKV's weight: [3, num_head, key_dim, qkv_dim] + OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("QKVTransposeOut"), "Output", + "QKVTransposeOut", "fused_gate_attention"); + + auto qkv_w_dims = ctx->GetInputDim("QKVWeight"); + + num_head = qkv_w_dims[1]; + key_dim = qkv_w_dims[2]; + m_size = seq_len_r; + + ctx->SetOutputDim("QKVTransposeOut", {3, batch_size, seq_len_m, num_head, + seq_len_r, key_dim}); + } else { + OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight", + "fused_gate_attention"); + + auto input_k_dims = ctx->GetInputDim("Key"); + auto q_w_dims = ctx->GetInputDim("QueryWeight"); + + num_head = q_w_dims[1]; + key_dim = q_w_dims[2]; + m_size = input_k_dims[2]; + + ctx->SetOutputDim("QueryTransposeOut", + {batch_size, seq_len_m, num_head, seq_len_r, key_dim}); + ctx->SetOutputDim("KeyTransposeOut", + {batch_size, seq_len_m, num_head, m_size, key_dim}); + ctx->SetOutputDim("ValueTransposeOut", + {batch_size, seq_len_m, num_head, m_size, key_dim}); + } + + ctx->SetOutputDim("SoftmaxOut", + {batch_size, seq_len_m, num_head, seq_len_r, m_size}); + ctx->SetOutputDim("FMHAOut", + {batch_size, seq_len_m, seq_len_r, num_head, key_dim}); + + if (ctx->Attrs().Get("has_gating")) { + OP_INOUT_CHECK(ctx->HasInput("GateWeight"), "Input", "GateWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("GateBias"), "Input", "GateBias", + "fused_gate_attention"); + ctx->SetOutputDim("GateOut", + {batch_size, seq_len_m, seq_len_r, num_head, key_dim}); + } + + ctx->SetOutputDim("Out", ctx->GetInputDim("Query")); + } +}; + +class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Query", "The query tensor."); + AddInput("Key", "The key tensor.").AsDispensable(); + AddInput("QueryWeight", "(optional) The query weight tensor.") + .AsDispensable(); + AddInput("KeyWeight", "(optional) The key weight tensor.").AsDispensable(); + AddInput("ValueWeight", "(optional) The value weight tensor.") + .AsDispensable(); 
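// [Editorial note] The two weight layouts accepted by this op, as implied by
// InferShape above and the kernel-side comments:
//   merge_qkv == true  (self-attention, q_dim == kv_dim):
//     Query:     [batch_size, seq_len_m, seq_len_r, q_dim]
//     QKVWeight: [3, num_head, key_dim, q_dim]
//     Key is not used (m_size == seq_len_r).
//   merge_qkv == false (q_dim may differ from kv_dim):
//     Query:       [batch_size, seq_len_m, seq_len_r, q_dim]
//     Key:         [batch_size, seq_len_m, m_size, kv_dim]
//     QueryWeight: [q_dim, num_head, key_dim]
//     KeyWeight / ValueWeight: [kv_dim, num_head, key_dim]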
+ AddInput("QKVWeight", "(optional) The qkv weight tensor.").AsDispensable(); + AddInput("NonbatchedBias", "(optional) The nonbatchedBias tensor.") + .AsDispensable(); + AddInput("SrcMask", "The attention mask tensor in fmha."); + AddInput("GateWeight", "(optional) The gate weight tensor.") + .AsDispensable(); + AddInput("GateBias", "(optional) The gate bias tensor.").AsDispensable(); + AddInput("OutLinearWeight", "The out_linear weight tensor."); + AddInput("OutLinearBias", "The out_linear bias tensor."); + AddOutput("QueryTransposeOut", "The transposed result of query matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("KeyTransposeOut", "The transposed result of key matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("ValueTransposeOut", "The transposed result of value matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("QKVTransposeOut", "The transposed result of merged QKV matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("SoftmaxOut", "Result in fmha.").AsIntermediate(); + AddOutput("FMHAOut", "Result in fmha.").AsIntermediate(); + AddOutput("GateOut", "Result of the gating module.") + .AsIntermediate() + .AsDispensable(); + AddOutput("Out", "Result after attention."); + AddAttr("has_gating", + "if true, the attention op uses gate architecure, " + "[default true].") + .SetDefault(true); + AddAttr("merge_qkv", + "if true, calculation with merged qkv, " + "[default true].") + .SetDefault(true); + AddComment(R"DOC( + Add fused attention op whose logic is as follows: + { + q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) + k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w) + v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w) + + logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q * c , k) + bias + weights = nn.functional.softmax(logits) + weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) + if nonbatched_bias is not None: + logits += paddle.unsqueeze(nonbatched_bias, axis=1) + + if self.gating: + gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data, + self.gating_w) + self.gating_b + gate_values_1 = nn.functional.sigmoid(gate_values) + weighted_avg *= gate_values_1 + + output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, + self.output_w) + self.output_b + + } + )DOC"); + } +}; + +class FusedGateAttentionGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query", + "fused_gate_attention_grad"); + if (ctx->HasOutput(framework::GradVarName("Query"))) { + ctx->SetOutputDim(framework::GradVarName("Query"), + ctx->GetInputDim("Query")); + } + if (ctx->HasOutput(framework::GradVarName("Key"))) { + ctx->SetOutputDim(framework::GradVarName("Key"), ctx->GetInputDim("Key")); + } + + if (ctx->Attrs().Get("merge_qkv")) { + OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight", + "fused_gate_attention_arad"); + ctx->SetOutputDim(framework::GradVarName("QKVWeight"), + ctx->GetInputDim("QKVWeight")); + } else { + OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight", + "fused_aate_attention_arad"); + OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight", + "fused_aate_attention_arad"); + OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight", + "fused_aate_attention_arad"); + + for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) { + ctx->SetOutputDim(framework::GradVarName(name), 
ctx->GetInputDim(name)); + } + } + + OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input", "OutLinearWeight", + "fused_aate_attention_arad"); + + if (ctx->Attrs().Get("has_gating")) { + for (auto& name : {"GateWeight", "GateBias", "GateOut"}) { + ctx->SetOutputDim(framework::GradVarName(name), ctx->GetInputDim(name)); + } + } + + if (ctx->HasOutput(framework::GradVarName("NonbatchedBias"))) { + ctx->SetOutputDim(framework::GradVarName("NonbatchedBias"), + ctx->GetInputDim("NonbatchedBias")); + } + + ctx->SetOutputDim(framework::GradVarName("FMHAOut"), + ctx->GetInputDim("FMHAOut")); + + ctx->SetOutputDim(framework::GradVarName("OutLinearWeight"), + ctx->GetInputDim("OutLinearWeight")); + ctx->SetOutputDim(framework::GradVarName("OutLinearBias"), + ctx->GetInputDim("OutLinearBias")); + } +}; + +template +class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_gate_attention_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + op->SetInput("Query", this->Input("Query")); + op->SetOutput(framework::GradVarName("Query"), this->InputGrad("Query")); + + op->SetAttrMap(this->Attrs()); + bool merge_qkv = BOOST_GET_CONST(bool, op->GetAttr("merge_qkv")); + if (merge_qkv) { + op->SetInput("QKVWeight", this->Input("QKVWeight")); + op->SetOutput(framework::GradVarName("QKVWeight"), + this->InputGrad("QKVWeight")); + op->SetInput("QKVTransposeOut", this->Output("QKVTransposeOut")); + } else { + op->SetInput("Key", this->Input("Key")); + op->SetOutput(framework::GradVarName("Key"), this->InputGrad("Key")); + + for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) { + op->SetInput(name, this->Input(name)); + op->SetOutput(framework::GradVarName(name), this->InputGrad(name)); + } + + for (auto& name : + {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut"}) { + op->SetInput(name, this->Output(name)); + } + } + + op->SetInput("FMHAOut", this->Output("FMHAOut")); + op->SetOutput(framework::GradVarName("FMHAOut"), + this->OutputGrad("FMHAOut")); + + if (this->HasInput("NonbatchedBias")) { + op->SetInput("NonbatchedBias", this->Input("NonbatchedBias")); + op->SetOutput(framework::GradVarName("NonbatchedBias"), + this->InputGrad("NonbatchedBias")); + } + + op->SetInput("SoftmaxOut", this->Output("SoftmaxOut")); + + bool has_gating = BOOST_GET_CONST(bool, op->GetAttr("has_gating")); + if (has_gating) { + op->SetInput("GateWeight", this->Input("GateWeight")); + op->SetOutput(framework::GradVarName("GateWeight"), + this->InputGrad("GateWeight")); + + op->SetInput("GateBias", this->Input("GateBias")); + op->SetOutput(framework::GradVarName("GateBias"), + this->InputGrad("GateBias")); + + op->SetInput("GateOut", this->Output("GateOut")); + op->SetOutput(framework::GradVarName("GateOut"), + this->OutputGrad("GateOut")); + } + + op->SetInput("OutLinearWeight", this->Input("OutLinearWeight")); + op->SetOutput(framework::GradVarName("OutLinearWeight"), + this->InputGrad("OutLinearWeight")); + + op->SetInput("OutLinearBias", this->Input("OutLinearBias")); + op->SetOutput(framework::GradVarName("OutLinearBias"), + this->InputGrad("OutLinearBias")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_gate_attention, ops::FusedGateAttentionOp, + ops::FusedGateAttentionOpMaker, + ops::FusedGateAttentionGradOpMaker, + 
ops::FusedGateAttentionGradOpMaker); +REGISTER_OPERATOR(fused_gate_attention_grad, ops::FusedGateAttentionGradOp); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu new file mode 100644 index 0000000000000..b1badf72557ae --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -0,0 +1,488 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fused_gate_attention.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct SigmoidMultiplyFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // out = sigmoid(x) * y + inline HOSTDEVICE T operator()(T x, T y) const { + MPType x_mp = static_cast(x); + T sigmoid_out = static_cast(one / (one + exp(-x_mp))); + return sigmoid_out * y; + } +}; + +template +struct SigmoidMultiplyGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // Gradient of Multiply: + // dx = dout * y + // dy = dout * x + // Gradient of Sigmoid: dx = dout * out * (1 - out) + inline HOSTDEVICE phi::Array operator()(const T dout, const T x, + T y) const { + MPType x_mp = static_cast(x); + T sigmoid_out = static_cast(one / (one + exp(-x_mp))); + T d_sigmoid_out = dout * y; + phi::Array outs; + outs[0] = d_sigmoid_out * sigmoid_out * + (static_cast(1.0f) - sigmoid_out); // dx + outs[1] = dout * sigmoid_out; // dy + return outs; + } +}; + +template +void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, Tensor *qkv_out) { + // query: shape=[batch_size, seq_len_m, seq_len_r, qkv_dim] + // qkv_weight: shape=[3, num_heads, key_dim, qkv_dim] + // qkv_out: shape=[batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim] + auto *qkv_weight = ctx.Input("QKVWeight"); + + // qkv_out = GEMM(query, qkv_weight^T) + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = 3 * config.num_heads * config.key_dim; + int k = config.q_dim; + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), false, true, m, n, k, false); + qkv_compute.ComputeForward(qkv_weight, query, nullptr, qkv_out, nullptr); +} + +template +Tensor *ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *query, + const Tensor *qkv_out_grad, + Tensor *query_grad, bool use_addto) { + auto *qkv_weight = ctx.Input("QKVWeight"); + auto *qkv_weight_grad = + ctx.Output(framework::GradVarName("QKVWeight")); + 
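// [Editorial note] Chain-rule check for SigmoidMultiplyGradFunctor above.
// With s = sigmoid(x) and out = s * y:
//   d out / d x = y * s * (1 - s)  ->  dx = dout * y * s * (1 - s)
//   d out / d y = s                ->  dy = dout * s
// which is what the functor returns (d_sigmoid_out = dout * y). A quick
// host-side finite-difference check of the same identities (illustrative only,
// not part of this patch):

#include <cassert>
#include <cmath>

inline double SigMul(double x, double y) { return y / (1.0 + std::exp(-x)); }

inline void CheckSigmoidMultiplyGrad() {
  const double x = 0.3, y = 1.7, dout = 0.9, eps = 1e-6;
  const double s = 1.0 / (1.0 + std::exp(-x));
  const double dx_analytic = dout * y * s * (1.0 - s);
  const double dx_numeric =
      dout * (SigMul(x + eps, y) - SigMul(x - eps, y)) / (2.0 * eps);
  assert(std::fabs(dx_analytic - dx_numeric) < 1e-6);
  const double dy_analytic = dout * s;
  const double dy_numeric =
      dout * (SigMul(x, y + eps) - SigMul(x, y - eps)) / (2.0 * eps);
  assert(std::fabs(dy_analytic - dy_numeric) < 1e-6);
}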
qkv_weight_grad->mutable_data(ctx.GetPlace()); + + // Gradient of GEMM(query, qkv_weight) + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = 3 * config.num_heads * config.key_dim; + int k = config.q_dim; + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), false, true, m, n, k, false); + qkv_compute.ComputeBackward(query, qkv_weight, qkv_out_grad, query_grad, + qkv_weight_grad, nullptr, use_addto); + return query_grad; +} + +template +void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, const Tensor *key, + Tensor *query_out, Tensor *key_out, + Tensor *value_out) { + auto *query_weight = ctx.Input("QueryWeight"); + auto *key_weight = ctx.Input("KeyWeight"); + auto *value_weight = ctx.Input("ValueWeight"); + + // query_out = GEMM(query, query_weight) + // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] + // query_weight: shape=[q_dim, num_heads, key_dim] + // query_out: shape=[batch_size, seq_len_m, seq_len_r, num_heads, key_dim] + int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; + int q_n = config.num_heads * config.key_dim; + int q_k = config.q_dim; + auto q_compute = AttnMatMul(ctx.cuda_device_context(), false, false, q_m, + q_n, q_k, false); + q_compute.ComputeForward(query_weight, query, nullptr, query_out, nullptr); + + // k_out = GEMM(key, key_weight) + // key: shape=[batch_size, seq_len_m, m_size, kv_dim] + // key_weight: shape=[kv_dim, num_heads, key_dim] + // key_out: shape=[batch_size, seq_len_m, m_size, num_heads, key_dim] + int kv_m = config.batch_size * config.seq_len_m * config.m_size; + int kv_n = config.num_heads * config.key_dim; + int kv_k = config.kv_dim; + auto kv_compute = AttnMatMul(ctx.cuda_device_context(), false, false, kv_m, + kv_n, kv_k, false); + kv_compute.ComputeForward(key_weight, key, nullptr, key_out, nullptr); + + // value_out = GEMM(value, value_weight) + kv_compute.ComputeForward(value_weight, key, nullptr, value_out, nullptr); +} + +template +Tensor *ComputeSeparatedQKVMatmulBackward( + const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, const Tensor *query, + const Tensor *key, const Tensor *query_out_grad, const Tensor *key_out_grad, + const Tensor *value_out_grad, Tensor *query_grad, Tensor *key_grad, + bool use_addto) { + // Gradient of GEMM(key, k_weight) + const auto *key_weight = ctx.Input("KeyWeight"); + auto *key_weight_grad = + ctx.Output(framework::GradVarName("KeyWeight")); + key_weight_grad->mutable_data(ctx.GetPlace()); + + int kv_m = config.batch_size * config.seq_len_m * config.m_size; + int kv_n = config.num_heads * config.key_dim; + int kv_k = config.kv_dim; + auto kv_compute = AttnMatMul(ctx.cuda_device_context(), false, false, kv_m, + kv_n, kv_k, false); + kv_compute.ComputeBackward(key, key_weight, key_out_grad, key_grad, + key_weight_grad, nullptr, false); + + // Gradient of GEMM(value, v_weight) + auto *value_weight = ctx.Input("ValueWeight"); + auto *value_weight_grad = + ctx.Output(framework::GradVarName("ValueWeight")); + value_weight_grad->mutable_data(ctx.GetPlace()); + + kv_compute.ComputeBackward(key, value_weight, value_out_grad, key_grad, + value_weight_grad, nullptr, true); + + // Gradient of GEMM(query, query_weight) + const auto *query_weight = ctx.Input("QueryWeight"); + auto *query_weight_grad = + ctx.Output(framework::GradVarName("QueryWeight")); + query_weight_grad->mutable_data(ctx.GetPlace()); + + int q_m = config.batch_size * config.seq_len_m 
* config.seq_len_r; + int q_n = config.num_heads * config.key_dim; + int q_k = config.q_dim; + auto q_compute = AttnMatMul(ctx.cuda_device_context(), false, false, q_m, + q_n, q_k, false); + q_compute.ComputeBackward(query, query_weight, query_out_grad, query_grad, + query_weight_grad, nullptr, use_addto); + return query_grad; +} + +template +Tensor *ComputeGatingLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, + const Tensor *fmha_out) { + auto *gate_weight = ctx.Input("GateWeight"); + auto *gate_bias = ctx.Input("GateBias"); + + auto *gate_out = ctx.Output("GateOut"); + gate_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "[ComputeGatingLinearForward] gate_out: " + << MemoryDebugString(*gate_out); + + // The first gate_bias_out stores the result of the multiplication, + // and the second gate_bias_out stores the result of the multiplication + + // bias. + // gate_out = GEMM(query, gate_weight) + gate_bias + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.num_heads * config.key_dim; + int k = config.q_dim; + auto gate_attn_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + gate_attn_compute.ComputeForward(gate_weight, query, gate_bias, gate_out, + gate_out); + + // gate_out = sigmoid(gate_out) * fmha_out + std::vector ins = {gate_out, fmha_out}; + std::vector outs = {gate_out}; + phi::funcs::ElementwiseKernel(ctx.cuda_device_context(), ins, &outs, + SigmoidMultiplyFunctor()); + return gate_out; +} + +template +Tensor *ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *fmha_out, + const Tensor *gate_out_grad, + Tensor *query_grad, Tensor *fmha_out_grad) { + const auto *query = ctx.Input("Query"); + const auto *gate_weight = ctx.Input("GateWeight"); + const auto *gate_bias = ctx.Input("GateBias"); + + // Re-compute gate_bias_out + Tensor gate_bias_out; + gate_bias_out.Resize(config.gate_out_dims); + gate_bias_out.mutable_data(ctx.GetPlace()); + + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.num_heads * config.key_dim; + int k = config.q_dim; + auto gate_attn_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + gate_attn_compute.ComputeForward(gate_weight, query, gate_bias, + &gate_bias_out, &gate_bias_out); + + // Gradient of sigmoid(gate_bias_out) * fmha_out + // Compute inplace and save gate_bias_out_grad to gate_bias_out. 
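  // For reference, the element-wise kernel launched just below applies the
  // product rule for out = sigmoid(gate_bias_out) * fmha_out. With
  // s = sigmoid(g), the two gradients it is expected to produce are
  //   d(gate_bias_out) = d_out * fmha_out * s * (1 - s)   (written inplace into gate_bias_out)
  //   d(fmha_out)      = d_out * s                        (written into fmha_out_grad)
  // A scalar sketch of the same math in plain C++, independent of the
  // SigmoidMultiplyGradFunctor definition (which lives outside this hunk):
  //   auto sigmoid = [](double g) { return 1.0 / (1.0 + std::exp(-g)); };
  //   auto d_gate  = [&](double d_out, double g, double x) {
  //     double s = sigmoid(g);
  //     return d_out * x * s * (1.0 - s);
  //   };
  //   auto d_fmha  = [&](double d_out, double g) { return d_out * sigmoid(g); };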
+ std::vector ins = {gate_out_grad, &gate_bias_out, fmha_out}; + std::vector outs = {&gate_bias_out, fmha_out_grad}; + phi::funcs::ElementwiseKernel, 2>( + ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyGradFunctor()); + + // Gradient of GEMM(query, gate_weight) + gate_bias + auto *gate_weight_grad = + ctx.Output(framework::GradVarName("GateWeight")); + auto *gate_bias_grad = ctx.Output(framework::GradVarName("GateBias")); + gate_weight_grad->mutable_data(ctx.GetPlace()); + gate_bias_grad->mutable_data(ctx.GetPlace()); + + gate_attn_compute.ComputeBackward(query, gate_weight, &gate_bias_out, + query_grad, gate_weight_grad, + gate_bias_grad); + return fmha_out_grad; +} + +template +Tensor *ComputeOutputLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *fmha_or_gate_out) { + const auto *out_linear_weight = ctx.Input("OutLinearWeight"); + const auto *out_linear_bias = ctx.Input("OutLinearBias"); + + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + VLOG(4) << "[ComputeOutputLinearForward] out: " << MemoryDebugString(*out); + + // out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.q_dim; + int k = config.num_heads * config.key_dim; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + out_linear_compute.ComputeForward(out_linear_weight, fmha_or_gate_out, + out_linear_bias, out, out); + return out; +} + +template +Tensor *ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + bool has_gating) { + std::string input_name = has_gating ? "GateOut" : "FMHAOut"; + + const auto *out_grad = ctx.Input(framework::GradVarName("Out")); + const auto *out_linear_weight = ctx.Input("OutLinearWeight"); + const auto *input = ctx.Input(input_name); + + auto *out_linear_weight_grad = + ctx.Output(framework::GradVarName("OutLinearWeight")); + auto *out_linear_bias_grad = + ctx.Output(framework::GradVarName("OutLinearBias")); + auto *input_grad = ctx.Output(framework::GradVarName(input_name)); + + out_linear_weight_grad->mutable_data(ctx.GetPlace()); + out_linear_bias_grad->mutable_data(ctx.GetPlace()); + input_grad->mutable_data(ctx.GetPlace()); + + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.q_dim; + int k = config.num_heads * config.key_dim; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + out_linear_compute.ComputeBackward(input, out_linear_weight, out_grad, + input_grad, out_linear_weight_grad, + out_linear_bias_grad); + return input_grad; +} + +template +class FusedGateAttentionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); + + const auto *src_mask = ctx.Input("SrcMask"); + const auto *nonbatched_bias = ctx.Input("NonbatchedBias"); + + auto *q_transpose_out = ctx.Output("QueryTransposeOut"); + auto *k_transpose_out = ctx.Output("KeyTransposeOut"); + auto *v_transpose_out = ctx.Output("ValueTransposeOut"); + auto *qkv_transpose_out = ctx.Output("QKVTransposeOut"); + + auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *fmha_out = ctx.Output("FMHAOut"); + + const bool merge_qkv = 
ctx.Attr("merge_qkv"); + const bool has_gating = ctx.Attr("has_gating"); + + // When seq_len_r = m_size, q_dim = kv_dim, QKV matmul can be merged. + auto &dev_ctx = ctx.template device_context(); + GateAttentionConfig config(query, key, query_weight, qkv_weight, + merge_qkv); + + if (merge_qkv) { + // 1. Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) + Tensor *qkv_out = config.GetQKVOut(dev_ctx); + ComputeMergedQKVMatmulForward(ctx, config, query, qkv_out); + + qkv_transpose_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "qkv_transpose_out:" << MemoryDebugString(*qkv_transpose_out); + } else { + // 1. Separated QKV Matmul + Tensor *query_out = config.GetQueryOut(dev_ctx); + Tensor *key_out = config.GetKeyOut(dev_ctx); + Tensor *value_out = config.GetValueOut(dev_ctx); + ComputeSeparatedQKVMatmulForward(ctx, config, query, key, query_out, + key_out, value_out); + + q_transpose_out->mutable_data(ctx.GetPlace()); + k_transpose_out->mutable_data(ctx.GetPlace()); + v_transpose_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "q_transpose_out: " << MemoryDebugString(*q_transpose_out); + VLOG(4) << "k_transpose_out: " << MemoryDebugString(*k_transpose_out); + VLOG(4) << "v_transpose_out: " << MemoryDebugString(*v_transpose_out); + } + + softmax_out->mutable_data(ctx.GetPlace()); + fmha_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "softmax_out: " << MemoryDebugString(*softmax_out); + VLOG(4) << "fmha_out: " << MemoryDebugString(*fmha_out); + + // 2. FMHA + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); + fmha_compute.ComputeForward( + nonbatched_bias, src_mask, q_transpose_out, k_transpose_out, + v_transpose_out, qkv_transpose_out, softmax_out, fmha_out, &config); + + // 3. Gating Linear + Tensor *fmha_or_gate_out = + !has_gating ? fmha_out : ComputeGatingLinearForward(ctx, config, + query, fmha_out); + + // 4. Output Linear + ComputeOutputLinearForward(ctx, config, fmha_or_gate_out); + } +}; + +template +class FusedGateAttentionGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto has_gating = ctx.Attr("has_gating"); + const auto merge_qkv = ctx.Attr("merge_qkv"); + + // forward input + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); + + // forward output, backward input + const auto *q_transpose_out = ctx.Input("QueryTransposeOut"); + const auto *k_transpose_out = ctx.Input("KeyTransposeOut"); + const auto *v_transpose_out = ctx.Input("ValueTransposeOut"); + const auto *qkv_transpose_out = ctx.Input("QKVTransposeOut"); + const auto *softmax_out = ctx.Input("SoftmaxOut"); + const auto *fmha_out = ctx.Input("FMHAOut"); + + // backward output + auto *query_grad = ctx.Output(framework::GradVarName("Query")); + query_grad->mutable_data(ctx.GetPlace()); + auto *nonbatched_bias_grad = + ctx.Output(framework::GradVarName("NonbatchedBias")); + auto *fmha_out_grad = ctx.Output(framework::GradVarName("FMHAOut")); + + auto &dev_ctx = ctx.template device_context(); + GateAttentionGradConfig config(query, key, query_weight, qkv_weight, + merge_qkv); + + // 1. Gradient of Output Linear + Tensor *fhma_or_gate_out_grad = + ComputeOutputLinearBackward(ctx, config, has_gating); + + // 2. Gradient of Gating Linear + if (has_gating) { + // fhma_or_gate_out_grad is actually gate_out_grad. 
+      fmha_out_grad->mutable_data<T>(ctx.GetPlace());
+      ComputeGatingLinearBackward<T>(ctx, config, fmha_out,
+                                     fhma_or_gate_out_grad, query_grad,
+                                     fmha_out_grad);
+    }
+
+    // 3. Gradient of FMHA
+    if (nonbatched_bias_grad) {
+      nonbatched_bias_grad->mutable_data<T>(ctx.GetPlace());
+    }
+
+    auto fmha_compute = FMHAGateRef<T>(dev_ctx, merge_qkv);
+    fmha_compute.ComputeBackward(
+        q_transpose_out, k_transpose_out, v_transpose_out, qkv_transpose_out,
+        softmax_out, fmha_out_grad, nullptr, nonbatched_bias_grad, &config);
+
+    bool use_addto = has_gating ? true : false;
+    if (merge_qkv) {
+      // 4. Gradient of Merged QKV Matmul
+      Tensor *qkv_out_grad = config.GetQKVOutGrad(dev_ctx);
+      ComputeMergedQKVMatmulBackward<T>(ctx, config, query, qkv_out_grad,
+                                        query_grad, use_addto);
+    } else {
+      // 4. Gradient of Separated QKV Matmul
+      auto *key_grad = ctx.Output<Tensor>(framework::GradVarName("Key"));
+      if (key_grad) {
+        key_grad->mutable_data<T>(ctx.GetPlace());
+      }
+      Tensor *query_out_grad = config.GetQueryOutGrad(dev_ctx);
+      Tensor *key_out_grad = config.GetKeyOutGrad(dev_ctx);
+      Tensor *value_out_grad = config.GetValueOutGrad(dev_ctx);
+      ComputeSeparatedQKVMatmulBackward<T>(
+          ctx, config, query, key, query_out_grad, key_out_grad, value_out_grad,
+          query_grad, key_grad, use_addto);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+#ifdef PADDLE_WITH_HIP
+REGISTER_OP_CUDA_KERNEL(fused_gate_attention,
+                        ops::FusedGateAttentionOpKernel<float>,
+                        ops::FusedGateAttentionOpKernel<double>,
+                        ops::FusedGateAttentionOpKernel<plat::float16>);
+REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad,
+                        ops::FusedGateAttentionGradKernel<float>,
+                        ops::FusedGateAttentionGradKernel<double>,
+                        ops::FusedGateAttentionGradKernel<plat::float16>);
+#else
+REGISTER_OP_CUDA_KERNEL(fused_gate_attention,
+                        ops::FusedGateAttentionOpKernel<float>,
+                        ops::FusedGateAttentionOpKernel<double>,
+                        ops::FusedGateAttentionOpKernel<plat::float16>,
+                        ops::FusedGateAttentionOpKernel<plat::bfloat16>);
+REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad,
+                        ops::FusedGateAttentionGradKernel<float>,
+                        ops::FusedGateAttentionGradKernel<double>,
+                        ops::FusedGateAttentionGradKernel<plat::float16>,
+                        ops::FusedGateAttentionGradKernel<plat::bfloat16>);
+#endif
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
index c95ca6fe0c96c..98602e4edd0a2 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
@@ -221,7 +221,7 @@ class FusedMultiTransformerOpOpMaker
               "'dropout_rate' must be between 0.0 and 1.0."));
         });
 
-    AddAttr<bool>("dropout_is_test",
+    AddAttr<bool>("is_test",
                   "(bool, default false) Set to true for inference only, false "
                   "for training. Some layers may run faster when this is true.")
         .SetDefault(false);
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
index fdd0208c3d316..fe93d323c59bc 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
@@ -24,7 +24,6 @@ limitations under the License.
*/ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index acb94e20df8cb..bed5125b99583 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -130,7 +130,7 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { int weight_sz = static_cast(weights.size()); auto i_dims = in->dims(); - auto w_dims = weights[0]->dims(); + const auto& w_dims = weights[0]->dims(); jit::matmul_attr_t attr; attr.m = i_dims[0]; attr.n = w_dims[1]; @@ -140,8 +140,8 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { relus[0]->mutable_data(place), attr); for (int i = 1; i < weight_sz - 1; ++i) { - auto i_dims = relus[i - 1]->dims(); - auto w_dims = weights[i]->dims(); + const auto& i_dims = relus[i - 1]->dims(); + const auto& w_dims = weights[i]->dims(); attr.m = i_dims[0]; attr.n = w_dims[1]; attr.k = w_dims[0]; @@ -150,8 +150,8 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { biases[i]->data(), relus[i]->mutable_data(place), attr); } - auto i_dims_last = relus[weight_sz - 2]->dims(); - auto w_dims_last = weights[weight_sz - 1]->dims(); + const auto& i_dims_last = relus[weight_sz - 2]->dims(); + const auto& w_dims_last = weights[weight_sz - 1]->dims(); attr.m = i_dims_last[0]; attr.n = w_dims_last[1]; attr.k = w_dims_last[0]; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index 91bc855d43c83..e574d67e3982c 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -91,8 +91,8 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); std::string pooltype = ctx.Attr("pooltype"); auto x0_lod = ins[0]->lod(); - auto x0_dims = ins[0]->dims(); - auto y_dims = out->dims(); + const auto& x0_dims = ins[0]->dims(); + const auto& y_dims = out->dims(); size_t bs = x0_lod[0].size() - 1; out->Resize({static_cast(bs), y_dims[1]}); framework::LoD y_lod(1); @@ -122,7 +122,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { - auto x_dims = ins[i]->dims(); + const auto& x_dims = ins[i]->dims(); auto x_lod = ins[i]->lod()[0]; const T* src = ins[i]->data(); T* dst = y_data + i * w; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index 123c4c885ead8..c74cc504840d3 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -92,8 +92,8 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); std::string pooltype = ctx.Attr("pooltype"); auto x0_lod = ins[0]->lod(); - auto x0_dims = ins[0]->dims(); - auto y_dims = out->dims(); + const auto& x0_dims = ins[0]->dims(); + const auto& y_dims = out->dims(); size_t bs = x0_lod[0].size() - 1; out->Resize({static_cast(bs), y_dims[1]}); framework::LoD y_lod(1); @@ -121,7 +121,7 @@ class 
FusionSeqPoolCVMConcatKernel : public framework::OpKernel { size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { - auto x_dims = ins[i]->dims(); + const auto& x_dims = ins[i]->dims(); auto x_lod = ins[i]->lod()[0]; const T* src = ins[i]->data(); T* dst = y_data + i * w; diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 552649279e911..deac932d59b80 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -54,26 +54,21 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); int64_t size = tensor->numel(); int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); auto& dev_cxt = context.template device_context(); - if (gen_cuda->GetIsInitPy() && seed_flag) { + if (seed == 0) { + // use global Generator seed auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = GaussianGenerator(mean, std, seed_offset.first, - seed_offset.second); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + auto func = GaussianGenerator(mean, std, seed, size * offset); phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 344b104b5948c..d420d0319bfe4 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -292,9 +292,9 @@ class InplaceABNGradKernel : public framework::OpKernel { auto* mean = ctx.Input("ReserveSpace"); auto* variance = ctx.Input("ReserveSpace"); - paddle::optional space_opt = paddle::none; - paddle::optional mean_opt = paddle::none; - paddle::optional variance_opt = paddle::none; + paddle::optional space_opt; + paddle::optional mean_opt; + paddle::optional variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 6c16210ced022..6476023fcd20e 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -120,9 +120,9 @@ class InplaceABNGradKernel auto* mean = ctx.Input("ReserveSpace"); auto* variance = ctx.Input("ReserveSpace"); - paddle::optional space_opt = paddle::none; - paddle::optional mean_opt = paddle::none; - paddle::optional variance_opt = paddle::none; + paddle::optional space_opt; + paddle::optional mean_opt; + paddle::optional variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index a7d96437e95c4..de92de453a354 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -17,93 +17,16 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean", - "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance", - "InstanceNorm"); - - const auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_NE(phi::product(x_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable X(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("X").front())); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X must " - "greater than or equal to 2. But received: the shape of input " - "X = [%s], the dimension of input X =[%d]", - x_dims, x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X must " - "smaller than or equal to 5, But received: the shape of input " - "X = [%s], the dimension of input X = [%d]", - x_dims, x_dims.size())); - auto N = x_dims[0]; - auto C = x_dims[1]; - auto NxC = N * C; - - if (ctx->HasInput("Scale")) { - auto scale_dim = ctx->GetInputDim("Scale"); - - PADDLE_ENFORCE_EQ( - scale_dim.size(), 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of scale must equal to 1." - "But received: the shape of scale is [%s], the dimension " - "of scale is [%d]", - scale_dim, scale_dim.size())); - - bool check = !((!ctx->IsRuntime()) && (phi::product(scale_dim) <= 0)); - - if (check) { - PADDLE_ENFORCE_EQ(scale_dim[0], C, - platform::errors::InvalidArgument( - "ShapeError: the shape of scale must equal to [%d]" - "But received: the shape of scale is [%d]", - C, scale_dim[0])); - } - } - if (ctx->HasInput("Bias")) { - auto bias_dim = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - bias_dim.size(), 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of bias must equal to 1." 
- "But received: the shape of bias is [%s],the dimension " - "of bias is [%d]", - bias_dim, bias_dim.size())); - - bool check = !((!ctx->IsRuntime()) && (phi::product(bias_dim) <= 0)); - if (check) { - PADDLE_ENFORCE_EQ(bias_dim[0], C, - platform::errors::InvalidArgument( - "ShapeError: the shape of bias must equal to [%d]" - "But received: the shape of bias is [%d]", - C, bias_dim[0])); - } - } - - ctx->SetOutputDim("Y", x_dims); - ctx->SetOutputDim("SavedMean", {NxC}); - ctx->SetOutputDim("SavedVariance", {NxC}); - ctx->ShareLoD("X", "Y"); -} - framework::OpKernelType InstanceNormOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -170,127 +93,6 @@ NCHW `[batch, in_channels, in_height, in_width]` )DOC"); } -template -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - T epsilon = static_cast(ctx.Attr("epsilon")); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes shape(NxC, sample_size); -// Once eigen on Windows is updated, the if branch can be removed. -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes rdims(1); -#else - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); - Eigen::IndexList> rdims; -#endif - - phi::funcs::SetConstant set_constant; - - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, saved_mean, static_cast(0)); - set_constant(dev_ctx, saved_variance, static_cast(0)); - - auto saved_mean_a = framework::EigenVector::Flatten(*saved_mean); - auto saved_mean_e = saved_mean_a.reshape(NxC_shape); - auto saved_variance_a = framework::EigenVector::Flatten(*saved_variance); - auto saved_variance_e = saved_variance_a.reshape(NxC_shape); - - auto x_e = framework::EigenVector::Flatten(*x); - auto x_arr = x_e.reshape(shape); - - saved_mean_e.device(*place) = x_arr.mean(rdims); - auto saved_variance_arr = - (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; - - saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_data; - Tensor bias_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - if (!bias) { - bias_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &bias_data, static_cast(0)); - } - auto scale_e = scale - ? framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto scale_arr = scale_e.reshape(C_shape); - auto bias_e = bias ? 
framework::EigenVector::Flatten(*bias) - : framework::EigenVector::Flatten( - const_cast(bias_data)); - auto bias_arr = bias_e.reshape(C_shape); - - y->mutable_data(ctx.GetPlace()); - auto y_e = framework::EigenVector::Flatten(*y); - auto y_arr = y_e.reshape(shape); - - // (x - mean) * inv_std * scale + bias - Eigen::DSizes bcast_param(N, sample_size); - y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * - saved_variance_e.broadcast(bcast) * - scale_arr.broadcast(bcast_param) + - bias_arr.broadcast(bcast_param); - } -}; - -void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", - framework::GradVarName("Y"), "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "InstanceNormGrad"); - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - } - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } -} - framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar(framework::GradVarName("Y")); @@ -312,148 +114,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - const int sample_size = x->numel() / N / C; - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - d_x->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes rshape(NxC, sample_size); - Eigen::DSizes param_shape(N, C); - Eigen::DSizes shape(NxC, sample_size); -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes rdims(0); - Eigen::DSizes mean_rdims(1); - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); -#else - Eigen::IndexList> rdims; - Eigen::IndexList> mean_rdims; - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); -#endif - - phi::funcs::SetConstant set_constant; - - Tensor scale_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - auto scale_e = scale - ? 
framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto mean_e = framework::EigenVector::Flatten(*saved_mean); - auto inv_var_e = framework::EigenVector::Flatten(*saved_inv_variance); - auto dy_e = framework::EigenVector::Flatten(*d_y); - auto x_e = framework::EigenVector::Flatten(*x); - - auto scale_arr = scale_e.reshape(C_shape); - auto mean_arr = mean_e.reshape(NxC_shape); - auto inv_var_arr = inv_var_e.reshape(NxC_shape); - auto dy_arr = dy_e.reshape(shape); - auto x_arr = x_e.reshape(shape); - - auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * - inv_var_arr.eval().broadcast(bcast); - - // math: d_bias = np.sum(d_y, axis=(n,h,w)) - // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, d_scale, static_cast(0)); - set_constant(dev_ctx, d_bias, static_cast(0)); - - auto d_scale_e = framework::EigenVector::Flatten(*d_scale); - auto d_scale_data = d_scale_e.reshape(C_shape); - auto d_bias_e = framework::EigenVector::Flatten(*d_bias); - auto d_bias_data = d_bias_e.reshape(C_shape); - d_bias_data.device(*place) = - dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); - d_scale_data.device(*place) = - (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); - } - - auto dy_mean = - dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); - - Eigen::DSizes bcast_param(N, sample_size); - set_constant(dev_ctx, d_x, static_cast(0)); - // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, - // axis=(h,w)) - // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - - // mean), - // axis=(h,w)) - auto dx_e = framework::EigenVector::Flatten(*d_x); - auto dx_arr = dx_e.reshape(shape); - dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * - inv_var_arr.broadcast(bcast) * - (dy_arr - dy_mean - - tmp * - (dy_arr * tmp) - .mean(mean_rdims) - .reshape(NxC_shape) - .eval() - .broadcast(bcast)); - } -}; - -void InstanceNormDoubleGradOp::InferShape( - framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "InstanceNormDoubleGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", - "InstanceNormDoubleGrad"); - - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - if (ctx->HasOutput("DX")) { - ctx->SetOutputDim("DX", x_dims); - } - if (ctx->HasOutput("DScale")) { - ctx->SetOutputDim("DScale", {C}); - } - if (ctx->HasOutput("DDY")) { - ctx->ShareDim("X", "DDY"); - } -} - framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar("DY"); @@ -475,213 +135,6 @@ framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - 
const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - const int sample_size = X->numel() / N / C; - const int NxC = N * C; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - Tensor mean_tensor; - Tensor inv_var_tensor; - ConstEigenArrayMap x_arr(X->data(), sample_size, NxC); - ConstEigenVectorArrayMap mean_arr(mean_data, NxC); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); - - Tensor mean_tile; - mean_tile.Resize({sample_size, NxC}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - - Tensor inv_var_tile; - inv_var_tile.Resize({sample_size, NxC}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - - mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); - inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({sample_size, NxC}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - scale_tile_data = scale_arr.transpose().replicate(sample_size, N); - - ConstEigenArrayMap dy_arr(dY->data(), sample_size, NxC); - ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); - - // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, - // axis=(h,w)) * - // np.sum(dy, axis=(h,w)) - - // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), - // axis=(h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * - // np.sum(dy, - // axis=(h,w)) * (x - mean) * - // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var * - // np.mean(dy, axis=(h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(h,w))) - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({sample_size, NxC}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), sample_size, - NxC); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); - EigenArrayMap dx_arr(dX->mutable_data(ctx.GetPlace()), sample_size, - NxC); - - if (ddX) { - dx_arr += - x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / - sample_size * - (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - - (dy_arr * ddx_arr).colwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - - dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (dy_arr.colwise().sum() / sample_size - dy_arr); - - dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (ddx_arr.colwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - dx_arr += (dy_arr * inv_var_tile_data - - dy_arr.colwise().sum() / sample_size * inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size) * - ddscale_tile_data; - } - } - if (dScale) { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx - dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - if (ddX) { - Tensor first_grad; - first_grad.Resize({sample_size, NxC}); - first_grad.mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, &first_grad, static_cast(0)); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), sample_size, NxC); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .colwise() - .sum() - .replicate(sample_size, 1) / - sample_size); - first_grad_arr = first_grad_arr * ddx_arr; - for (int nc = 0; nc < NxC; ++nc) { - int c = nc % C; - dscale_arr(c) += first_grad_arr.colwise().sum()(nc); - } - } - } - if (ddY) { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(h,w))) - ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); - EigenArrayMap ddy_arr(ddY->mutable_data(ctx.GetPlace()), - sample_size, NxC); - if (ddX) { - ddy_arr += scale_tile_data * inv_var_tile_data * - (ddx_arr - ddx_arr.colwise().sum() / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - } - if (ddScale && ddBias) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({sample_size, NxC}); - ddbias_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - ddy_arr += ddbias_tile_data; - } - } - } -}; - 
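The CPU kernels removed above (forward, grad, and double grad) all compute the same per-instance statistics; their replacements presumably live in the phi kernel library, alongside the InstanceNormInferMeta functions registered below. For orientation, a minimal standalone sketch of the forward pass they implement, in plain C++ and not tied to any Paddle API (note the removed code stores SavedVariance as the inverse standard deviation rather than the variance):

#include <cmath>
#include <vector>

// y[n][c][...] = (x - mean_nc) / sqrt(var_nc + eps) * scale[c] + bias[c],
// where mean_nc and var_nc are taken over the H*W (*D) samples of each
// (n, c) pair and sample_size = numel / (N * C).
void InstanceNormForwardRef(const std::vector<float>& x, int N, int C,
                            int sample_size, float eps,
                            const std::vector<float>& scale,
                            const std::vector<float>& bias,
                            std::vector<float>* y) {
  y->resize(x.size());
  for (int nc = 0; nc < N * C; ++nc) {
    const float* xp = x.data() + nc * sample_size;
    float mean = 0.f, var = 0.f;
    for (int i = 0; i < sample_size; ++i) mean += xp[i];
    mean /= sample_size;
    for (int i = 0; i < sample_size; ++i) var += (xp[i] - mean) * (xp[i] - mean);
    var /= sample_size;
    const float inv_std = 1.f / std::sqrt(var + eps);
    const int c = nc % C;
    for (int i = 0; i < sample_size; ++i) {
      (*y)[nc * sample_size + i] = (xp[i] - mean) * inv_std * scale[c] + bias[c];
    }
  }
}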
 DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer,
                            {"DY", "DDY"});
@@ -689,30 +142,26 @@ DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer,
 } // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(instance_norm, InstanceNormInferShapeFunctor,
+                            PD_INFER_META(phi::InstanceNormInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(instance_norm_grad,
+                            InstanceNormGradInferShapeFunctor,
+                            PD_INFER_META(phi::InstanceNormGradInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(
+    instance_norm_grad_grad, InstanceNormDoubleGradInferShapeFunctor,
+    PD_INFER_META(phi::InstanceNormDoubleGradInferMeta));
 REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker,
                   ops::InstanceNormOpInferVarType,
                   ops::InstanceNormGradMaker<paddle::framework::OpDesc>,
-                  ops::InstanceNormGradMaker<paddle::imperative::OpBase>);
+                  ops::InstanceNormGradMaker<paddle::imperative::OpBase>,
+                  InstanceNormInferShapeFunctor);
 REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
                   ops::InstanceNormDoubleGradMaker<paddle::framework::OpDesc>,
-                  ops::InstanceNormDoubleGradMaker<paddle::imperative::OpBase>);
+                  ops::InstanceNormDoubleGradMaker<paddle::imperative::OpBase>,
+                  InstanceNormGradInferShapeFunctor);
 REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
-                  ops::InstanceNormDoubleGradOpInplaceInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    instance_norm,
-    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    instance_norm_grad,
-    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    instance_norm_grad_grad,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::InstanceNormDoubleGradOpInplaceInferer,
+                  InstanceNormDoubleGradInferShapeFunctor);
 
 REGISTER_OP_VERSION(instance_norm)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu
deleted file mode 100644
index e51cd9835318a..0000000000000
--- a/paddle/fluid/operators/instance_norm_op.cu
+++ /dev/null
@@ -1,818 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/instance_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -static __global__ void repeat_param(const T *input, T *output, - const int repeat_num, const int C) { - CUDA_KERNEL_LOOP(i, repeat_num * C) { - int index = i % C; - output[i] = input[index]; - } -} - -template -static __global__ void add_param(const T *input, T *output, - const int repeat_num, const int C) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ou_storage; - for (int i = blockIdx.x; i < C; i += gridDim.x) { - T ou = static_cast(0); - for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { - const int index = j * C + i; - ou += static_cast(input[index]); - } - ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); - if (threadIdx.x == 0) { - output[i] = ou; - } - __syncthreads(); - - if (AVG) { - output[i] /= repeat_num; - } - } -} - -template -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must be CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *x = ctx.Input("X"); - auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must greater than " - "or equal to 2. But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must smaller than" - "or equal to 5. But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - Tensor x_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - auto &dev_ctx = ctx.template device_context(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - bias_tmp.mutable_data(ctx.GetPlace()); - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min((NxC + block - 1) / block, max_blocks); - - phi::funcs::SetConstant set_constant; - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - if (bias) { - repeat_param<<>>( - bias->data(), bias_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &bias_tmp, static_cast(0)); - } - - auto handle = dev_ctx.cudnn_handle(); - - phi::funcs::SetConstant> - functor; - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, static_cast(x_tmp.template data()), - data_desc_, - static_cast(y->template mutable_data(ctx.GetPlace())), - in_param_desc_, - const_cast(static_cast( - scale_tmp.template data>())), - const_cast(static_cast( - bias_tmp.template data>())), - 0, nullptr, nullptr, epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast( - saved_variance->template mutable_data>( - ctx.GetPlace())))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, y->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - bias_tmp.template data>(), 0, nullptr, - nullptr, epsilon, - saved_mean->template 
mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - -template -static __global__ void GradComputeDX(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int sample_size, - T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - BatchNormParamType mean_val = mean[ncid]; - BatchNormParamType inv_var_val = variance[ncid]; - - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - BatchNormParamType dy_i = static_cast>(dy[i]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[i]) - mean_val); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = - BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] = - (static_cast>(dy[i]) - - dy_sum_val / static_cast>(sample_size) - - (static_cast>(x[i]) - mean_val) * - dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * - scale[c] * inv_var_val; - } -} - -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const auto *scale = ctx.Input("Scale"); - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - - Tensor x_tmp, d_y_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D}); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - } - if (scale) { - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of scale's dimensions must be equal to 1. But " - "received: the size of scale's dimensions" - "is [%d]", - scale->dims().size())); - PADDLE_ENFORCE_EQ(scale->dims()[0], C, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the first dimension of scale must be equal to " - "Channels([%d]). 
But received: " - "the first dimension of scale is [%d]," - "the dimensions of scale is [%s], ", - C, scale->dims()[0], scale->dims())); - } - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(NxC, max_blocks); - const int grid1 = (C + block - 1) / block; - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor d_scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - Tensor d_bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - if ((H * W * D) == 1) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - if (d_scale && d_bias) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, d_y_tmp.template data(), data_desc_, - d_x->template mutable_data(ctx.GetPlace()), in_param_desc_, - scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x_tmp.template data(), data_desc_, d_y_tmp.template data(), - data_desc_, d_x->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } else { - if (d_x) { - GradComputeDX<<>>( - d_y->data(), scale_tmp.data>(), - saved_mean_data, x->data(), saved_var_data, C, H * W * D, - d_x->data()); - } - } - - if (d_scale && d_bias) { - add_param<<>>( - d_scale_tmp.data(), d_scale->data(), N, C); - add_param<<>>( - d_bias_tmp.data(), d_bias->data(), N, C); - } - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - -static __device__ __forceinline__ float real_sqrt(float x) { - return 1. / sqrtf(x); -} -static __device__ __forceinline__ double real_sqrt(double x) { - return 1. 
/ sqrt(x); -} - -template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, int C, int sample_size, - const double epsilon, T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T dy_sum_val; - __shared__ T ddx_sum_val; - __shared__ T dy_mul_ddx_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T ddx_sum = 0; - T dy_mul_ddx_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - T dy_i = dy[i]; - T tmp = x[i] - mean_val; - - dy_sum += dy_i; - ddx_sum += ddx_i; - dy_mul_ddx_sum += (ddx_i * dy_i); - - dy_mul_x_sub_mean_sum += (dy_i * tmp); - ddx_mul_x_sub_mean_sum += (ddx_i * tmp); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - dy_mul_ddx_sum = - BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - ddx_sum_val = ddx_sum; - dy_mul_ddx_sum_val = dy_mul_ddx_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += - ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * - (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + - 3. 
* dy_mul_x_sub_mean_sum_val * var_val * - ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + - ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (dy_sum_val / sample_size - dy[i]) + - dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (ddx_sum_val / sample_size - ddx[i])) * - scale[c]; - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - - (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * - var_val / sample_size) * - ddscale[c]; - } - } -} - -template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, int C, int sample_size, - const double epsilon, T *ddy) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T ddx_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T ddx_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - ddx_sum += ddx_i; - ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); - } - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - ddx_sum_val = ddx_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += scale[c] * var_val * - (ddx[i] - ddx_sum_val / sample_size - - (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * - var_val / sample_size); - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; - } - } - __syncthreads(); - if (ddbias != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += ddbias[c]; - } - } -} - -template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, int C, int sample_size, - const double epsilon, T *dscale) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; - __shared__ T dy_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T dy_i = dy[i]; - dy_sum += dy_i; - dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; 
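Each CUDA block in the kernels above owns one (n, c) instance of sample_size = H * W * D elements and block-reduces a handful of per-instance sums before applying the closed-form gradient. A minimal CPU sketch of two of those sums, with illustrative names (InstanceGradSums is not part of the operator):

#include <cstddef>

// CPU reference for the per-instance partial sums reduced in the kernels above
// (illustrative only): dy_sum and sum of dy * (x - mean) over one instance.
template <typename T>
void InstanceGradSums(const T* dy, const T* x, T mean_val,
                      std::size_t sample_size, T* dy_sum,
                      T* dy_mul_x_sub_mean_sum) {
  T s = T(0), sxm = T(0);
  for (std::size_t i = 0; i < sample_size; ++i) {
    s += dy[i];
    sxm += dy[i] * (x[i] - mean_val);
  }
  *dy_sum = s;
  *dy_mul_x_sub_mean_sum = sxm;
}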
- } - __syncthreads(); - - if (ddx != nullptr) { - T dscale_tmp = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dscale_tmp += - ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - - dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * - var_val * var_val / sample_size); - } - dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); - - if (threadIdx.x == 0) { - dscale[ncid] += dscale_tmp; - } - __syncthreads(); - } -} - -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - const T *x_data = X->data(); - const T *dy_data = dY->data(); - const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); - - const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); - const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); - - const T *mean_data = Saved_mean->data(); - const T *variance_data = Saved_variance->data(); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - - auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - const int n = X->numel(); - int sample_size = n / N / C; - - Tensor scale_tmp; - if (!Scale) { - scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_zero(dev_ctx, &scale_tmp, static_cast(1)); - } - const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); - - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = NxC; - const int grid1 = (C + block - 1) / block; - - if (dX) { - T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dX, static_cast(0)); - DoubleGradComputeDX<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, C, sample_size, epsilon, dx_data); - } - if (dScale) { - Tensor dscale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - set_zero(dev_ctx, &dscale_tmp, static_cast(0)); - T *dscale_tmp_data = dscale_tmp.mutable_data(ctx.GetPlace()); - - T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dScale, static_cast(0)); - DoubleGradComputeDScale<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size, - epsilon, dscale_tmp_data); - add_param<<>>( - dscale_tmp.data(), dScale->data(), N, C); - } - if (ddY) { - T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, ddY, static_cast(0)); - DoubleGradComputeDDY<<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, - scale_data, C, sample_size, epsilon, ddy_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel); -REGISTER_OP_CUDA_KERNEL(instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel< - paddle::platform::CUDADeviceContext, float>); -#else -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel, - ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel, - ops::InstanceNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel, - ops::InstanceNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 493f54ab3baa6..265e4acef0d7a 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -16,9 +16,7 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/norm_utils.h" namespace paddle { namespace operators { @@ -27,22 +25,9 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - class InstanceNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +37,6 @@ class InstanceNormOp : public framework::OperatorWithKernel { class InstanceNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -62,7 +46,6 @@ class InstanceNormGradOp : public framework::OperatorWithKernel { class InstanceNormDoubleGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -130,23 +113,5 @@ class InstanceNormOpInferVarType } }; -template -class InstanceNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 09daf0afe18bf..18a86d1531724 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -52,7 +52,7 @@ class SampleWithProb { const std::size_t num_samples, const Tensor* L, Tensor* S, Tensor* P) { // UNDERSTAND: dimension issues - const auto lbl_dim = L->dims(); + const auto& lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; const int num_true = lbl_dim[1]; const int num_sampled_classes = num_true + num_samples; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index a880afb0e9be3..e4b033b6c5857 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -98,8 +98,8 @@ struct SelectedRowsAddTensor { const phi::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); - auto in2_dims = input2.dims(); - auto out_dims = output->dims(); + const auto& in2_dims = input2.dims(); + const auto& out_dims = output->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." 
@@ -249,7 +249,7 @@ struct SelectedRowsAddToTensor { return; } auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." @@ -289,7 +289,7 @@ struct SelectedRowsAddToTensor { return; } auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." @@ -838,7 +838,7 @@ struct UpdateToTensor { const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9833b4447ec45..69642c8194221 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -231,7 +231,7 @@ class SoftmaxFunctor> { public: void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* X, framework::Tensor* Y) { - auto in_dims = X->dims(); + const auto& in_dims = X->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index c07582c84acb9..cd1fa13001ce2 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -53,7 +53,7 @@ std::vector Tree2ColUtil::construct_patch( void Tree2ColUtil::construct_tree(const framework::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { - auto edge_set_dims = EdgeSet.dims(); + const auto &edge_set_dims = EdgeSet.dims(); PADDLE_ENFORCE_EQ(edge_set_dims[1], 2, platform::errors::InvalidArgument( "The second dimension of the EdgeSet shall be 2, but " @@ -89,7 +89,7 @@ class Tree2ColFunctor { const framework::Tensor &node_features, framework::Tensor *patch, int max_depth) { std::vector> tr; - auto feature_dims = node_features.dims(); + const auto &feature_dims = node_features.dims(); auto cpu_place = context.GetPlace(); phi::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; @@ -142,7 +142,7 @@ class Col2TreeFunctor { const framework::Tensor &out_grad, framework::Tensor *in_grad, int max_depth) { std::vector> tr; - auto output_dims = out_grad.dims(); + const auto &output_dims = out_grad.dims(); auto cpu_place = context.GetPlace(); phi::funcs::SetConstant constant; int64_t output_size = output_dims[1]; diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index 05d6bae5f719a..91dccbee0aef2 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -45,19 +45,17 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { out_new_dims[i] = out_new_dims[i] > 0 ? 
out_new_dims[i] : x_vec_dims[i]; } - dnnl::memory::desc x_mem_desc = x->mem_desc(); if (x_vec_dims.size() != out_new_dims.size()) { - x_mem_desc = GetExtendedMemoryDescriptor(x_mem_desc, x_vec_dims, - out_new_dims.size()); + x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size()); } out->Resize(phi::make_ddim(out_new_dims)); paddle::platform::BroadcastDataMKLDNNHandler handler( - dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), out, x, - 0.0f, 1.0f, x_mem_desc); + dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), x, out, + 0.0f, 1.0f, x_vec_dims); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); // acquires zeroed mem + auto dst_memory_p = handler.AcquireZeroedDstMemory(out); auto binary_p = handler.AcquireForwardPrimitive(); const std::unordered_map args = { @@ -73,14 +71,13 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { } private: - dnnl::memory::desc GetExtendedMemoryDescriptor( - const dnnl::memory::desc& x_mem_desc, - const std::vector& x_vec_dims, int new_size) const { - std::vector new_dims(new_size, 1); + std::vector GetExtendedXDims(const std::vector& x_vec_dims, + int new_size) const { + std::vector extended_x_dims(new_size, 1); std::copy(x_vec_dims.begin(), x_vec_dims.end(), - new_dims.begin() + new_size - x_vec_dims.size()); + extended_x_dims.begin() + new_size - x_vec_dims.size()); - return x_mem_desc.reshape(new_dims); + return extended_x_dims; } }; diff --git a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc index cfc320da47fff..73e783068379d 100644 --- a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc @@ -79,8 +79,10 @@ class FillConstantMKLDNNKernel : public framework::OpKernel { {DNNL_ARG_DST, *src0_memory_p}}); astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetPlainMKLDNNFormat(out->dims().size())); + // src0_memory_p's md was just to allow the usage of a binary + // primitive as a memset, and now we need to create a real one + out->set_mem_desc({phi::vectorize(shape), platform::MKLDNNGetDataType(), + platform::GetPlainMKLDNNFormat(shape.size())}); } T CalculateFillValue(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index d3a36555c389a..245ae2196ca38 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -124,7 +124,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (!workspace_memory->get_desc().is_zero()) { - mid->set_format(platform::GetMKLDNNFormat(*workspace_memory)); + mid->set_mem_desc(workspace_memory->get_desc()); lrn_p->execute(astream, {{DNNL_ARG_SRC, *src_memory}, {DNNL_ARG_DST, *dst_memory}, {DNNL_ARG_WORKSPACE, *workspace_memory}}); @@ -134,8 +134,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { } astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(dst_memory->get_desc()); } }; @@ -177,8 +176,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { {DNNL_ARG_WORKSPACE, *workspace}}); astream.wait(); - in_x_grad->set_layout(framework::DataLayout::kMKLDNN); - 
in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); + in_x_grad->set_mem_desc(diff_src_memory->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake index 3ebfbdc50caab..8bad3e86b2934 100644 --- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -1 +1 @@ -cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op activation_op pooling transpose_op scope device_context enforce executor) +cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op crop_op activation_op pooling transpose_op scope device_context enforce executor) diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index 2a8627b803a6e..2df9e5c20fda8 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -175,19 +175,17 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { dnnl::memory::data_type dout_type = framework::ToMKLDNNDataType( framework::TransToProtoVarType(dout->dtype())); - dnnl::memory::desc md(dout_vec_dims, platform::MKLDNNGetDataType(), - dout->format()); - dnnl::memory::format_tag reorder_format_tag = - platform::GetMKLDNNFormat(md.reshape(slice_dims)); platform::ReorderMKLDNNHandler reorder_handler( slice_dims, framework::TransToProtoVarType(dout->dtype()), dout_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - reorder_format_tag, platform::to_void_cast(dout->data())); + dout->mem_desc().reshape(slice_dims), + platform::to_void_cast(dout->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dx, dx_vec_dims, reorder_format_tag, ctx.GetPlace()); + dx, dx_vec_dims, platform::GetPlainMKLDNNFormat(dx_vec_dims.size()), + ctx.GetPlace()); memset(dx->data(), 0, reorder_dst_memory_p->get_desc().get_size()); auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, @@ -199,8 +197,7 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(reorder_format_tag); + dx->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc index 36be1681b05e7..28a00be5fa47e 100644 --- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc @@ -59,7 +59,7 @@ class StackMKLDNNHandler // wrong output format deduction and suboptimal performance as a result if (stack_axis != ndims) { for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format())); + srcs_md.push_back(inputs[i]->mem_desc()); } input_dims[stack_axis] *= inputs.size(); @@ -69,8 +69,7 @@ class StackMKLDNNHandler extended_input_dims[stack_axis] = 1; for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format()) - .reshape(extended_input_dims)); + srcs_md.push_back(inputs[i]->mem_desc().reshape(extended_input_dims)); } // concat primitive choses suboptimal format tag because it cannot @@ -130,9 +129,8 @@ class StackMKLDNNOpKernel : public paddle::framework::OpKernel { 
concat_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat( - dst_mem->get_desc().reshape(phi::vectorize(output->dims())))); + output->set_mem_desc( + dst_mem->get_desc().reshape(phi::vectorize(output->dims()))); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 99f957f573a17..de21c2687bd44 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -60,17 +60,16 @@ class SumMKLDNNHandler auto src_tz = dst_tz; std::vector srcs_md; + srcs_md.reserve(in_vars.size()); for (size_t i = 0; i < in_vars.size(); i++) { auto& input_it = in_vars[i]->Get(); if (input_it.numel() == 0) { continue; } - MKLDNNMemoryFormat input_format = input_it.format(); - srcs_md.push_back(dnnl::memory::desc( - src_tz, platform::MKLDNNGetDataType(), input_format)); + srcs_md.push_back(input_it.mem_desc()); ++num_inputs_; } - std::vector scales(num_inputs_, 1.0); + std::vector scales(num_inputs_, 1.0f); auto dst_md = dnnl::memory::desc(dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); @@ -139,47 +138,27 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { ++input_index; } - std::shared_ptr dst_mem = nullptr; + std::unordered_map args; + std::shared_ptr dst_mem; + + for (size_t i = 0; i < srcs_mem.size(); ++i) { + args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); + } + if (in_place) { - dst_mem = handler.AcquireDstMemory(); - output->mutable_data(ctx.GetPlace()); + dst_mem = srcs_mem[0]; } else { dst_mem = handler.AcquireDstMemory(output); } + args.insert({DNNL_ARG_DST, *dst_mem}); auto sum_p = handler.AcquireForwardPrimitive(); - std::unordered_map args; - for (size_t i = 0; i < srcs_mem.size(); ++i) { - args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); - } - args.insert({DNNL_ARG_DST, *dst_mem}); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); sum_p->execute(astream, args); astream.wait(); - // For in-place execution which sum does not have we need to fake it - // so from oneDNN dst memory we reorder data into input - if (in_place) { - auto& in_out = in_vars[0]->Get(); - auto output_tz = phi::vectorize(output->dims()); - platform::ReorderMKLDNNHandler reorder_handler( - output_tz, framework::TransToProtoVarType(output->dtype()), - framework::ToMKLDNNDataType( - framework::TransToProtoVarType(in_out.dtype())), - dev_ctx.GetEngine()); - - auto target_mem = reorder_handler.AcquireDstMemory( - output, in_out.format(), ctx.GetPlace()); - - auto reorder_p = reorder_handler.AcquireReorder(target_mem, dst_mem); - - reorder_p->execute(astream, *dst_mem, *target_mem); - astream.wait(); - } - output->set_layout(framework::DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat(*dst_mem)); + output->set_mem_desc(dst_mem->get_desc()); } }; diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 4ff93ee3cd624..b9866ba8c3647 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -34,6 +34,8 @@ USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); USE_OP_ITSELF(shape); USE_OP_DEVICE_KERNEL(shape, MKLDNN); +USE_OP_ITSELF(crop); +USE_OP_DEVICE_KERNEL(crop, CPU); PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); @@ -211,5 +213,68 @@ 
TEST(test_pool2d_shape_nhwc, cpu_place) { "Computed shape does not match expected shape")); } +TEST(test_pool2d_crop_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + framework::DDim expected_dims({1, 3, 7, 512}); // NCHW expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + InputVars second_crop_input_name = { + "v", scope.Var("v")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(10.0f, 20.0f); + std::mt19937 engine; + size_t numel = static_cast(phi::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + // Second input (Y) to crop is having no buffer + // but as it is MKLDNN then its shape order should be NCHW + auto expected_dims_nchw = phi::vectorize(expected_dims); + std::rotate(expected_dims_nchw.begin() + 1, expected_dims_nchw.end() - 1, + expected_dims_nchw.end()); + second_crop_input_name.tensor->Resize(phi::make_ddim(expected_dims_nchw)); + const auto second_crop_input_md = + dnnl::memory::desc(expected_dims_nchw, dnnl::memory::data_type::f32, + dnnl::memory::format_tag::nhwc); + second_crop_input_name.tensor->set_mem_desc(second_crop_input_md); + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by crop. crop may have Y input as + // non buffered so the path to be executed is handling oneDNN kernel + // that is followed by CPU kernel with non-buffered Input + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + std::vector offsets{0, 0, 0, 0}; + auto op_crop = framework::OpRegistry::CreateOp( + "crop", {{"X", {"y"}}, {"Y", {"v"}}}, {{"Out", {"z"}}}, + {{"offsets", {offsets}}}); + + op_pool->Run(scope, p); + op_crop->Run(scope, p); + + pool.Get(p)->Wait(); + + // Verify shape of output + PADDLE_ENFORCE_EQ(z->dims(), expected_dims, + platform::errors::InvalidArgument( + "Output shape does not match expected output shape")); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/nanmedian_op.cc b/paddle/fluid/operators/nanmedian_op.cc new file mode 100644 index 0000000000000..23a497bdb1d3d --- /dev/null +++ b/paddle/fluid/operators/nanmedian_op.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +class NanmedianOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class NanmedianOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input feature data of NanmedianOp, dtype should be" + "int32, int64, float16, float32 or float64."); + AddOutput( + "MedianIndex", + "Store the index position of median values, The calculation differs " + "in the odd or even valid elements numbers." + "Along the axis, two elements contributed to the median value in " + "each row." + "If the amount of valid elements were even, both were the same.") + .AsIntermediate() + .AsExtra(); + AddOutput("Out", + "(Tensor)," + " the output of NanmedianOp, whose dtype is the same as X"); + AddAttr("keepdim", + "(bool, default true) " + "If true, retain the reduced axis with length 1.") + .SetDefault(true); + AddAttr>("axis", + "(std::vector). List of integers," + " indicating the dimensions to calculate medians") + .SetDefault({}); + AddComment(R"DOC( + Nanmedian operator + + This operator is considered as an extention of median operation, + which supports specifically the case of NaN values in the input. + + If all the elements in input are NaN it will also return NaN. + If no elements in input are Nan, this op is identical to thie median op. + + If the valid count of elements is a even number, the average value of + the elements in the middle is calculated as the median. + + This operator can also supports multiple axis. 
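As a concrete reading of the doc block above, here is a small self-contained sketch of the NaN-ignoring median rule for a single row. It is illustrative only, not the kernel implementation; NanMedianRow is a made-up helper name.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// Drop NaNs, then take the median of what remains; for an even count of valid
// elements, average the two middle values, as described in the op comment.
inline float NanMedianRow(std::vector<float> row) {
  row.erase(std::remove_if(row.begin(), row.end(),
                           [](float v) { return std::isnan(v); }),
            row.end());
  if (row.empty()) return std::numeric_limits<float>::quiet_NaN();
  std::sort(row.begin(), row.end());
  const std::size_t n = row.size();
  return (n % 2 != 0) ? row[n / 2] : 0.5f * (row[n / 2 - 1] + row[n / 2]);
}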
+ )DOC"); + } +}; + +template +class NanmedianGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr op) const override { + op->SetType("nanmedian_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("MedianIndex", this->Output("MedianIndex")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +class NanmedianGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(nanmedian, NanmedianInferShapeFunctor, + PD_INFER_META(phi::NanmedianInferMeta)); + +REGISTER_OPERATOR(nanmedian, ops::NanmedianOp, ops::NanmedianOpMaker, + ops::NanmedianGradMaker, + ops::NanmedianGradMaker, + NanmedianInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(nanmedian_grad, NanmedianGradInferShapeFunctor, + PD_INFER_META(phi::NanmedianGradInferMeta)); + +REGISTER_OPERATOR(nanmedian_grad, ops::NanmedianGradOp, + NanmedianGradInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op_mlu.cc b/paddle/fluid/operators/one_hot_v2_op_mlu.cc new file mode 100644 index 0000000000000..855cdda963cb6 --- /dev/null +++ b/paddle/fluid/operators/one_hot_v2_op_mlu.cc @@ -0,0 +1,86 @@ + +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class OneHotV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + int depth = ctx.Attr("depth"); + if (ctx.HasInput("depth_tensor")) { + std::vector depth_data; + depth_data = GetDataFromTensor(ctx.Input("depth_tensor")); + depth = depth_data[0]; + + auto out_dims = out->dims(); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + out->mutable_data(ctx.GetPlace()); + + float on_value = 1.0f, off_value = 0.0f; + const int in_off_dim[1] = {1}; + Tensor on_value_tensor = ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + Tensor off_value_tensor = ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor); + FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor); + + if (framework::TransToProtoVarType(in->dtype()) == + framework::proto::VarType::INT32) { + MLUCnnlTensorDesc desc_indices(*in); + MLUCnnl::OneHot(ctx, desc_indices.get(), GetBasePtr(in), depth, + GetBasePtr(&on_value_tensor), + GetBasePtr(&off_value_tensor), -1, + ToCnnlDataType(out->dtype()), GetBasePtr(out)); + } else { + Tensor transformed_in; + transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); + // use cnnlCast to cast int64_t to int32_t then do one_hot + MLUCnnlTensorDesc in_desc(*in); + MLUCnnlTensorDesc transformed_in_desc(transformed_in); + cnnlCastDataType_t cast_type = GetCastDataType( + framework::TransToProtoVarType(in->dtype()), + framework::TransToProtoVarType(transformed_in.dtype())); + MLUCnnl::Cast(ctx, cast_type, in_desc.get(), GetBasePtr(in), + transformed_in_desc.get(), GetBasePtr(&transformed_in)); + MLUCnnl::OneHot( + ctx, transformed_in_desc.get(), GetBasePtr(&transformed_in), depth, + GetBasePtr(&on_value_tensor), GetBasePtr(&off_value_tensor), -1, + ToCnnlDataType(out->dtype()), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index fc954e60a8c3e..9d6ecf414e664 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -72,8 +72,7 @@ class DGCMomentumKernel : public framework::OpKernel { auto* velocity_out = context.Output("VelocityOut"); auto* master_param_out = context.Output("MasterParamOut"); - paddle::optional master_param_opt = - paddle::none; + paddle::optional master_param_opt(paddle::none); float mu = context.Attr("mu"); bool use_nesterov = context.Attr("use_nesterov"); std::string regularization_method = @@ -117,8 +116,7 @@ class DGCMomentumKernel : public framework::OpKernel { auto* param_out = context.Output("ParamOut"); auto* master_param_out = context.Output("MasterParamOut"); - paddle::optional master_param_opt = - paddle::none; + paddle::optional master_param_opt(paddle::none); if (multi_precision) { auto* master_param = context.Input("MasterParam"); 
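The dgc_momentum hunks here switch from copy-initializing an empty optional to direct initialization, and only rebind it when the multi-precision master parameter is actually present. A rough std::optional analogue of that control flow, with hypothetical names:

#include <optional>

// Hypothetical sketch: start with an empty optional and bind it only when the
// optional multi-precision input exists.
inline std::optional<const float*> PickMasterParam(const float* master_param,
                                                   bool multi_precision) {
  std::optional<const float*> master_opt(std::nullopt);
  if (multi_precision) master_opt = master_param;
  return master_opt;
}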
master_param_opt = *master_param; @@ -149,8 +147,7 @@ class DGCMomentumKernel : public framework::OpKernel { auto* param_out = context.Output("ParamOut"); auto* master_param_out = context.Output("MasterParamOut"); - paddle::optional master_param_opt = - paddle::none; + paddle::optional master_param_opt(paddle::none); if (multi_precision) { auto* master_param = context.Input("MasterParam"); master_param_opt = *master_param; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 0c174b0825c9f..94d8cc41d3f31 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -29,11 +29,11 @@ inline std::vector CalculateReducedDims( bool reduce_all, bool keep_dim) { if (keep_dim) return phi::vectorize(output->dims()); - if (reduce_all) - return std::vector(phi::vectorize(input->dims()).size(), 1); + if (reduce_all) return std::vector(input->dims().size(), 1); std::vector output_dims(phi::vectorize(input->dims())); for (size_t i = 0; i < reduce_dims.size(); ++i) { + // handle negative dims, f.e. "-1" means rightmost dimension reduce_dims[i] = (reduce_dims[i] >= 0) ? reduce_dims[i] : input->dims().size() + reduce_dims[i]; @@ -52,16 +52,16 @@ class ReduceMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - const auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto reduce_dims = ctx.Attr>("dim"); bool reduce_all = ctx.Attr("reduce_all"); bool keep_dim = ctx.Attr("keep_dim"); - auto output_dims = - CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); - auto input_dims = phi::vectorize(input->dims()); + auto x_tz = phi::vectorize(x->dims()); + auto out_tz = + CalculateReducedDims(x, out, reduce_dims, reduce_all, keep_dim); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -69,18 +69,19 @@ class ReduceMKLDNNKernel : public framework::OpKernel { // copied without actual reduction. 
// In that case reorder must be executed to maintain compatibility with // PaddlePaddle reduce op - if (input_dims == output_dims) { - dnnl::memory::data_type input_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); + if (x_tz == out_tz) { + dnnl::memory::data_type x_type = framework::ToMKLDNNDataType( + framework::TransToProtoVarType(x->dtype())); platform::ReorderMKLDNNHandler reorder_handler( - input_dims, framework::TransToProtoVarType(input->dtype()), - input_type, onednn_engine); + x_tz, framework::TransToProtoVarType(x->dtype()), x_type, + onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - input->mem_desc(), platform::to_void_cast(input->data())); + x->mem_desc(), platform::to_void_cast(x->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - output, input->mem_desc(), ctx.GetPlace()); + // reuse mem desc since it is a simple copy + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); @@ -88,15 +89,15 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } else { platform::ReductionMKLDNNHandler handler(reduction_type, 0.0f, 0.0f, onednn_engine, ctx.GetPlace(), - input, output, output_dims); + x, out, out_tz); - auto src_memory_p = handler.AcquireSrcMemory(input); - auto dst_memory_p = handler.AcquireDstMemory(output); + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); std::unordered_map reduction_args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; @@ -105,8 +106,9 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + + out->set_mem_desc(dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } } }; @@ -127,22 +129,15 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel { const auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - const auto input_dims = - CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); - const auto output_dims = phi::vectorize(dx->dims()); - - auto dout_mem_desc = dout->mem_desc(); - - if (input_dims != output_dims) { - dout_mem_desc = dout_mem_desc.reshape(input_dims); - } + auto dout_tz = CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); + auto dx_tz = phi::vectorize(dx->dims()); - platform::BroadcastDataMKLDNNHandler handler( - binary_type, onednn_engine, ctx.GetPlace(), dx, dout, scale_x, scale_y, - dout_mem_desc); + platform::BroadcastDataMKLDNNHandler handler(binary_type, onednn_engine, + ctx.GetPlace(), dout, dx, + scale_x, scale_y, dout_tz); const auto src_memory_p = handler.AcquireSrcMemory(dout); - const auto dst_memory_p = handler.AcquireDstMemory(dx); + const auto dst_memory_p = handler.AcquireZeroedDstMemory(dx); const auto binary_prim = handler.AcquireForwardPrimitive(); const std::unordered_map args = { diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc 
b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index 04660fb501142..e3d8d15a305a9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -112,6 +112,8 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Input("Out"); auto* out_grad = context.Input(framework::GradVarName("Out")); + auto reduce_dims = context.Attr>("dim"); + bool reduce_all = context.Attr("reduce_all"); int in_dtype = context.Attr("in_dtype"); PADDLE_ENFORCE_EQ( @@ -129,12 +131,30 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { // broadcast auto x_dims_vec = phi::vectorize(x->dims()); + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < x_dims_vec.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + + Tensor tmp_out, tmp_out_grad; + auto tmp_out_dims_vec = x_dims_vec; + for (auto d : reduce_dims) { + tmp_out_dims_vec[d] = 1; + } + + tmp_out.ShareDataWith(*out); + tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec)); + tmp_out_grad.ShareDataWith(*out_grad); + tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); + Tensor transformed_out(x->type()); transformed_out.Resize(phi::make_ddim(x_dims_vec)); transformed_out.mutable_data(place); NpuOpRunner r_brd_out; r_brd_out.SetType("BroadcastTo") - .AddInput(*out) + .AddInput(tmp_out) .AddInput(std::move(x_dims_vec)) .AddOutput(transformed_out) .Run(stream); @@ -143,7 +163,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { transformed_out_grad.mutable_data(place); NpuOpRunner r_brd_out_grad; r_brd_out_grad.SetType("BroadcastTo") - .AddInput(*out_grad) + .AddInput(tmp_out_grad) .AddInput(std::move(x_dims_vec)) .AddOutput(transformed_out_grad) .Run(stream); diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 13490d6fcde3a..7be1c19012099 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -37,7 +37,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto sampling_ratio = ctx.Attr("sampling_ratio"); auto aligned = ctx.Attr("aligned"); - auto in_dims = in->dims(); + const auto& in_dims = in->dims(); int batch_size = in_dims[0]; int channels = in_dims[1]; int height = in_dims[2]; diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc new file mode 100644 index 0000000000000..c543a088e9d7f --- /dev/null +++ b/paddle/fluid/operators/rrelu_op.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class RReluOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class RReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of RReLU op."); + AddOutput("Out", "The output of RReLU op."); + AddOutput("Noise", "The random sampled RReLU noise.") + .AsIntermediate() + .AsExtra(); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + float default_lower = 1. / 8.; + AddAttr("lower", "Lower bound of the uniform distribution.") + .SetDefault(default_lower) + .AddCustomChecker([](const float& lower) { + PADDLE_ENFORCE_EQ(lower >= 0.0f && lower < 1.0f, true, + platform::errors::InvalidArgument( + "'RRelu_lower' must be between 0.0 and 1.0.")); + }); + float defalut_upper = 1. / 3.; + AddAttr("upper", "Upper bound of the uniform distribution.") + .SetDefault(defalut_upper) + .AddCustomChecker([](const float& upper) { + PADDLE_ENFORCE_EQ(upper > 0.0f && upper <= 1.0f, true, + platform::errors::InvalidArgument( + "'RRelu_upper' must be between 0.0 and 1.0.")); + }); + AddComment(R"DOC( +RReLU Operator. + +Applies the randomized leaky rectified liner unit function, element-wise, +as described in the paper: + +`Empirical Evaluation of Rectified Activations in Convolutional Network`_. + +The function is defined as: + +.. math:: + \text{RReLU}(x) = + \begin{cases} + x & \text{if } x \geq 0 \\ + ax & \text{ otherwise } + \end{cases} + +where :math:`a` is randomly sampled from uniform distribution +:math:`\mathcal{U}(\text{lower}, \text{upper})`. 
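As a concrete reading of the formula in the doc block above, a minimal element-wise sketch of the training-time rule follows; the function name and the use of std::mt19937 are illustrative and not the actual kernel.

#include <random>

// Illustrative element-wise RReLU in training mode: non-negative inputs pass
// through, negative inputs are scaled by a slope sampled from U(lower, upper).
inline float RReluTrain(float x, float lower, float upper, std::mt19937* gen) {
  std::uniform_real_distribution<float> slope(lower, upper);
  return x >= 0.0f ? x : slope(*gen) * x;
}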
+ + See: https://arxiv.org/pdf/1505.00853.pdf + +)DOC"); + } +}; + +class RReluGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +template +class RReluGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("rrelu_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Noise", this->Output("Noise")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(rrelu, RReluInferShapeFunctor, + PD_INFER_META(phi::RReluInferMeta)); + +REGISTER_OPERATOR(rrelu, ops::RReluOp, ops::RReluOpMaker, + ops::RReluGradOpMaker, + ops::RReluGradOpMaker, + RReluInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(rrelu_grad, RReluGradInferShapeFunctor, + PD_INFER_META(phi::RReluGradInferMeta)); +REGISTER_OPERATOR(rrelu_grad, ops::RReluGradOp, RReluGradInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_op_mlu.cc b/paddle/fluid/operators/softmax_op_mlu.cc index 9cb698e94fc56..9b97e779f29ef 100644 --- a/paddle/fluid/operators/softmax_op_mlu.cc +++ b/paddle/fluid/operators/softmax_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class SoftmaxMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,7 +45,7 @@ class SoftmaxMLUKernel : public framework::OpKernel { regard_in_shape = {d1, d2, d3}; } - static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + static const cnnlSoftmaxAlgorithm_t algo = softmax_algo; MLUCnnlTensorDesc in_desc(cnnl_softmax_dims, regard_in_shape.data(), ToCnnlDataType()); MLUCnnl::SoftmaxForward(ctx, algo, mode, NULL, in_desc.get(), @@ -54,7 +54,7 @@ class SoftmaxMLUKernel : public framework::OpKernel { } }; -template +template class SoftmaxGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -82,7 +82,7 @@ class SoftmaxGradMLUKernel : public framework::OpKernel { regard_out_shape = {d1, d2, d3}; } - static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + static const cnnlSoftmaxAlgorithm_t algo = softmax_algo; MLUCnnlTensorDesc out_desc(cnnl_softmax_dims, regard_out_shape.data(), ToCnnlDataType()); MLUCnnl::SoftmaxBackward(ctx, algo, mode, out_desc.get(), GetBasePtr(out), @@ -97,7 +97,16 @@ class SoftmaxGradMLUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_MLU_KERNEL(softmax, ops::SoftmaxMLUKernel, - ops::SoftmaxMLUKernel); -REGISTER_OP_MLU_KERNEL(softmax_grad, ops::SoftmaxGradMLUKernel, - ops::SoftmaxGradMLUKernel); +REGISTER_OP_MLU_KERNEL( + softmax, ops::SoftmaxMLUKernel, + ops::SoftmaxMLUKernel); +REGISTER_OP_MLU_KERNEL(softmax_grad, + ops::SoftmaxGradMLUKernel, + ops::SoftmaxGradMLUKernel); +REGISTER_OP_MLU_KERNEL( + log_softmax, ops::SoftmaxMLUKernel, + ops::SoftmaxMLUKernel); +REGISTER_OP_MLU_KERNEL( + log_softmax_grad, ops::SoftmaxGradMLUKernel, + ops::SoftmaxGradMLUKernel); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index ae846f4cae6fb..3e27402c86947 100644 --- 
a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -151,12 +151,6 @@ void UniformRandom(const framework::ExecutionContext& context, T* data = tensor->mutable_data(dev_cxt.GetPlace()); if (size <= 0) return; unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } T min = static_cast(context.Attr("min")); T max = static_cast(context.Attr("max")); @@ -165,14 +159,15 @@ void UniformRandom(const framework::ExecutionContext& context, unsigned int diag_step = static_cast(context.Attr("diag_step")); T diag_val = static_cast(context.Attr("diag_val")); - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { + + if (seed == 0) { + // Use global Generator seed using MT = typename details::MPTypeTrait::Type; phi::funcs::uniform_distribution dist; phi::funcs::uniform_real_transform trans(min, max); phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); } else { + // Use OP seed auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); phi::IndexKernel>(dev_cxt, tensor, func); diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 6da5d1244fbed..8c04e935134c7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -149,8 +149,8 @@ class RecordedGpuMallocHelper { if (FLAGS_enable_gpu_memory_usage_log) { // A fake UPDATE to trigger the construction of memory stat instances, // make sure that they are destructed after RecordedGpuMallocHelper. - MEMORY_STAT_UPDATE(Reserved, dev_id, 0); - MEMORY_STAT_UPDATE(Allocated, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0); } } @@ -161,15 +161,18 @@ class RecordedGpuMallocHelper { if (FLAGS_enable_gpu_memory_usage_log) { if (FLAGS_enable_gpu_memory_usage_log_mb) { std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0 + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / + 1048576.0 << ", Allocated = " - << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0 + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / + 1048576.0 << std::endl; } else { std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << ", Allocated = " - << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl; + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) + << std::endl; } } } @@ -222,15 +225,15 @@ class RecordedGpuMallocHelper { if (UNLIKELY(malloc_managed_memory)) { result = cudaMallocManaged(ptr, size); } else { - VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) - << " MB"; result = cudaMalloc(ptr, size); + VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) + << " MB, result=" << result; } #endif if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); @@ -269,7 +272,7 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); 
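The RecordedGpuMallocHelper hunks above keep the same bookkeeping shape: a successful device malloc bumps an atomic byte counter plus a per-device memory stat, and the free path reverses both. A stripped-down sketch of just the counter half, with illustrative class and method names:

#include <atomic>
#include <cstddef>

// Minimal sketch of the recorded-allocation bookkeeping: an atomic running
// total of bytes currently held, updated on every successful alloc and free.
class RecordedBytes {
 public:
  void OnAlloc(std::size_t bytes) { cur_size_.fetch_add(bytes); }
  void OnFree(std::size_t bytes) { cur_size_.fetch_sub(bytes); }
  std::size_t Current() const { return cur_size_.load(); }

 private:
  std::atomic<std::size_t> cur_size_{0};
};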
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 0871624a5d749..9e960a99123c0 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -74,11 +74,7 @@ void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); } void IpuBackend::Detach() { executor_->Detach(); } -void IpuBackend::Reset() { - executor_->Detach(); - compiler_.reset(); - executor_.reset(); -} +void IpuBackend::Reset() { executor_->Reset(); } void IpuBackend::SetScope(const framework::Scope& scope) { scope_ = &scope; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 4f15ecf3babf2..d490334ee33f5 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -88,11 +88,7 @@ class PdIArray final : public popart::IArray { } // namespace -Executor::~Executor() { - Detach(); - session_.reset(); - executor_resources_.reset(); -} +Executor::~Executor() { Reset(); } void Executor::Prepare(const std::string &proto) { VLOG(10) << "enter Executor::Prepare"; @@ -299,6 +295,12 @@ void Executor::Detach() { } } +void Executor::Reset() { + Detach(); + session_.reset(); + executor_resources_.reset(); +} + void Executor::SetWeightsIO() { auto opt_type = compiler_resources_->optimizer_type; VLOG(10) << "SetWeightsIO for " << opt_type; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index 70c9477e69bab..1a46ebc69b197 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -63,6 +63,9 @@ class Executor { // Detach IPU void Detach(); + // Reset session + void Reset(); + // Scope void SetScope(const Scope *scope) { scope_ = scope; } diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index ab68ebf3a5448..778c18146d64d 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -109,6 +109,8 @@ XPUOpMap& get_kp_ops() { {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, {"pull_box_sparse", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"push_box_sparse", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 904e4854ba6b4..09a29c3429cba 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -169,7 +169,7 @@ inline void EmplaceDeviceContext( cuda_ctx->PartialInitWithAllocator(); dev_ctx->SetGenerator( - framework::GetDefaultCUDAGenerator(p.GetDeviceId()).get()); + framework::DefaultCUDAGenerator(p.GetDeviceId()).get()); #endif } else { dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() @@ -750,7 +750,7 @@ dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { void 
MKLDNNDeviceContext::ResetBlobMap(void* ptr) { VLOG(4) << tls().get_curr_exec() << " " << ptr; std::lock_guard lock(*p_mutex_); - if (!block_next_cache_clearing_) { + if (block_next_cache_clearing_ == 0) { VLOG(3) << "Clearing DNNL cache."; // If no specific executor pointer then clear // everything. For executor pointer then clear only @@ -768,9 +768,20 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { s.second->erase(ptr); } } + // Reset paddle layout to NCHW + VLOG(3) << "Resetting Paddle data layout to NCHW."; + platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( + paddle::framework::DataLayout::kNCHW); } else { - VLOG(3) << "Prevented Clearing DNNL cache."; - block_next_cache_clearing_ = false; + --block_next_cache_clearing_; + VLOG(3) << "Prevented Clearing DNNL cache. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; + PADDLE_ENFORCE_GE(block_next_cache_clearing_, 0, + platform::errors::InvalidArgument( + "Cache clearing mark should be non-negative " + ". But received %d.", + block_next_cache_clearing_)); } } @@ -796,8 +807,10 @@ void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, void MKLDNNDeviceContext::BlockNextCacheClearing() { std::lock_guard lock(*p_mutex_); - VLOG(3) << "Next DNNL cache clearing has been blocked."; - block_next_cache_clearing_ = true; + ++block_next_cache_clearing_; + VLOG(3) << "Next DNNL cache clearing has been blocked. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; } size_t MKLDNNDeviceContext::GetShapeBlobSize() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2b53ecf86a641..a63d41405f1b2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -850,7 +850,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext { // to erase std::shared_ptr p_exec_items_; std::shared_ptr p_mutex_; - bool block_next_cache_clearing_ = false; + // 0 - clearing is allowed. x > 0 do not clear. + unsigned int block_next_cache_clearing_ = 0; }; #endif diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index 8261c866d073d..e8a6051c19f2d 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -31,9 +31,11 @@ size_t Alignment(size_t size, const platform::Place &place, int align_size) { alignment = alignment; #elif defined(PADDLE_WITH_ASCEND_CL) alignment = NPUMinChunkSize(); +#elif defined(PADDLE_WITH_MLU) + alignment = MLUMinChunkSize(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA/XPU/NPU.")); + "Fluid is not compiled with CUDA/XPU/NPU/MLU.")); #endif } } diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index a3f88592b7649..ee37b93807eaa 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -21,6 +21,9 @@ limitations under the License. 
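The oneDNN cache hunks above turn the old boolean "skip the next clearing" flag into an unsigned counter, so several nested blocking requests are each honoured exactly once. A minimal standalone model of that behaviour with std types (the class and member names are invented):

    #include <map>
    #include <mutex>
    #include <string>

    class BlobCacheLike {
     public:
      void BlockNextClearing() {
        std::lock_guard<std::mutex> guard(mu_);
        ++block_next_clearing_;        // each caller books one skipped clear
      }
      void Clear() {
        std::lock_guard<std::mutex> guard(mu_);
        if (block_next_clearing_ == 0) {
          blobs_.clear();              // no pending block: really clear
        } else {
          --block_next_clearing_;      // consume one pending "skip" request
        }
      }

     private:
      std::mutex mu_;
      unsigned int block_next_clearing_ = 0;  // 0: clearing allowed, >0: skip
      std::map<std::string, int> blobs_;
    };
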
*/ #include "paddle/fluid/platform/device/npu/npu_info.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 600a4cbcc3ed9..2fcc573456d42 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -848,3 +848,16 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); * Example: */ PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); + +/** + * Preformance related FLAG + * Name: einsum_opt + * Since Version: 2.3.0 + * Value Range: bool, default=false + * Example: + * Note: If True, EinsumOp will be optimimzed by innercache reuse, which + * uses more gpu memory. + */ +PADDLE_DEFINE_EXPORTED_bool( + einsum_opt, false, + "EinsumOp backward will be speedup at the expense of more gpu memory."); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 94c0124440ea9..5e77046962931 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -148,8 +148,6 @@ inline void ClearMKLDNNCache(const platform::Place& place, platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(place); dev_ctx->ResetBlobMap(ptr); - platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( - paddle::framework::DataLayout::kNCHW); } } diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 13b5005a30fa0..5476d244f6035 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -616,29 +616,17 @@ class BinaryMKLDNNHandler public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, const dnnl::engine engine, platform::Place cpu_place, - const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, + const Tensor* x, const Tensor* y, Tensor* out, + float scale_x, float scale_y, float scale_out, const dnnl::post_ops& post_ops = dnnl::post_ops{}) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, x->layout())); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, y->layout())); - const auto src_x_tz = phi::vectorize(x->dims()); const auto src_y_tz = phi::vectorize(y->dims()); // if output tensor(z) is nullptr then we are computing into oneDNN // managed buffer auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : phi::vectorize(z->dims()); + const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? 
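The BinaryMKLDNNHandler hunk above only renames scale_z to scale_out; the scale arithmetic it feeds to oneDNN stays as the comment describes: scale_0 = scale_out / scale_x, and scale_1 = scale_out / scale_y for add or 1 / scale_y for mul. A small numeric check of exactly that equation (plain floats, nothing Paddle-specific):

    #include <cassert>
    #include <cmath>

    struct BinaryScales { float src0; float src1; };

    // add: out = (scale_out/scale_x)*x + (scale_out/scale_y)*y
    // mul: out = (scale_out/scale_x)*x * (1/scale_y)*y
    BinaryScales MakeScales(bool is_add, float scale_x, float scale_y,
                            float scale_out) {
      return {scale_out / scale_x,
              is_add ? scale_out / scale_y : 1.0f / scale_y};
    }

    int main() {
      // Quantized add: x stored with scale 2, y with scale 4, output wants 8.
      BinaryScales s = MakeScales(/*is_add=*/true, 2.f, 4.f, 8.f);
      assert(std::fabs(s.src0 - 4.f) < 1e-6f);
      assert(std::fabs(s.src1 - 2.f) < 1e-6f);
      return 0;
    }
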
src_x_tz : src_y_tz) + : phi::vectorize(out->dims()); auto src0_md = x->mem_desc(); auto src1_md = y->mem_desc(); @@ -667,7 +655,7 @@ class BinaryMKLDNNHandler MKLDNNMemoryFormat::any); auto attributes = - CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); + CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); @@ -681,7 +669,7 @@ class BinaryMKLDNNHandler private: static inline dnnl::primitive_attr CreateAttributes( - dnnl::algorithm op, float scale_x, float scale_y, float scale_z, + dnnl::algorithm op, float scale_x, float scale_y, float scale_out, dnnl::post_ops post_ops = dnnl::post_ops{}) { // Scales set in attributes for inputs contibute to the output equation // in the following way (assuming no broadcasting takes place): @@ -699,9 +687,9 @@ class BinaryMKLDNNHandler // For mul operation on the other hand // output = (scale_out / scale_x) * x * (1.0 / scale_y) * y // - float scale_0 = scale_z / scale_x; + float scale_0 = scale_out / scale_x; float scale_1 = - op == dnnl::algorithm::binary_add ? scale_z / scale_y : 1.0 / scale_y; + op == dnnl::algorithm::binary_add ? scale_out / scale_y : 1.0 / scale_y; dnnl::primitive_attr attributes; attributes.set_scales(/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0}); @@ -718,21 +706,15 @@ class BroadcastDataMKLDNNHandler public: BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, const dnnl::engine engine, - platform::Place cpu_place, const Tensor* out, - const Tensor* x, float scale_x, float scale_y, - const dnnl::memory::desc& x_mem_desc) + platform::Place cpu_place, const Tensor* x, + Tensor* out, float scale_x, float scale_y, + const std::vector& extended_x_dims) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - const auto src0_tz = phi::vectorize(out->dims()); - const auto src0_md = dnnl::memory::desc(src0_tz, platform::MKLDNNGetDataType(), platform::GetPlainMKLDNNFormat(src0_tz.size())); - - const auto src1_md = x_mem_desc; + const auto src1_md = x->mem_desc().reshape(extended_x_dims); dnnl::primitive_attr attributes; attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); @@ -743,9 +725,9 @@ class BroadcastDataMKLDNNHandler } template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = output->mutable_data( - this->place_, this->fwd_pd_->dst_desc().get_size()); + std::shared_ptr AcquireZeroedDstMemory(framework::Tensor* out) { + T_out* ptr = out->mutable_data(this->place_, + this->fwd_pd_->dst_desc().get_size()); memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -758,22 +740,18 @@ class ReductionMKLDNNHandler ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, const float eps, const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, - const Tensor* y, std::vector y_tz, - const dnnl::primitive_attr& attr = NULL) + const Tensor* out, std::vector out_tz, + const dnnl::primitive_attr& attrs = NULL) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - - const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType(), - dnnl::memory::format_tag::any); + const auto out_md = memory::desc(out_tz, platform::MKLDNNGetDataType(), 
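BroadcastDataMKLDNNHandler above now takes extended_x_dims and reshapes the input's memory descriptor to it before broadcasting. One common way to build such dims is to left-pad the lower-rank input with 1s up to the output rank; the hunk does not show how its callers compute them, so treat the helper below as an assumption, not Paddle's actual logic:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Right-align x_dims inside a vector of 1s of length out_rank, e.g.
    // {8, 3} with out_rank == 4 becomes {1, 1, 8, 3}. Requires
    // x_dims.size() <= out_rank.
    std::vector<int64_t> ExtendDimsForBroadcast(
        const std::vector<int64_t>& x_dims, std::size_t out_rank) {
      std::vector<int64_t> extended(out_rank, 1);
      std::copy(x_dims.begin(), x_dims.end(),
                extended.end() - static_cast<std::ptrdiff_t>(x_dims.size()));
      return extended;
    }
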
+ dnnl::memory::format_tag::any); - if (attr) - this->AcquireForwardPrimitiveDescriptor(attr, algo, x->mem_desc(), y_md, - p, eps); + if (attrs) + this->AcquireForwardPrimitiveDescriptor(attrs, algo, x->mem_desc(), + out_md, p, eps); else - this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), y_md, p, + this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), out_md, p, eps); } }; diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 24c515f5b4956..f64e05504aa3f 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -168,8 +168,10 @@ void PrintMemProfiler( if (num_gpus > 0) { std::cout << "GPU Memory Usage (MB):\n"; for (int dev_id = 0; dev_id < num_gpus; ++dev_id) { - int64_t allocated = memory::StatGetCurrentValue("Allocated", dev_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", dev_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); size_t available = 0, total = 0, actual_available = 0, actual_total = 0; RecordedGpuMemGetInfo(&available, &total, &actual_available, &actual_total, dev_id); diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 530cc6992d391..c1b26ee0b792d 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/pybind/eager_op_function_impl.h" #include "paddle/fluid/pybind/tensor_py.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/string_tensor.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index c509ab5674930..99ec4212918de 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -65,7 +65,7 @@ static PyObject *eager_api_final_state_linear(PyObject *self, PyObject *args, if (bias.initialized()) { auto mm_out = matmul_final_state_dygraph_function(x, weight, false, false); - auto out = add_final_state_dygraph_function(bias, mm_out); + auto out = add_final_state_dygraph_function(mm_out, bias); PyEval_RestoreThread(tstate); tstate = nullptr; return ToPyObject(out); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 5395b4f31c83b..628e808ef99ac 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -45,7 +45,6 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 1a0838d7f47c6..b54f4e1416c35 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -494,7 +494,8 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, } paddle::experimental::Tensor* grad; - if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { + bool is_leaf = egr::egr_utils_api::IsLeafTensor(self->tensor); + if (is_leaf) { grad = 
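The eager linear fix above composes the result as add(mm_out, bias), i.e. the matmul output first and the bias second, with the bias broadcast across rows. A plain-C++ rendering of that composition (nested vectors only; shapes are assumed consistent and non-empty):

    #include <cstddef>
    #include <vector>

    using Mat = std::vector<std::vector<float>>;

    // linear(x, w, b): matmul first, then add the bias to every output row.
    Mat Linear(const Mat& x, const Mat& w, const std::vector<float>& bias) {
      const std::size_t m = x.size(), k = w.size(), n = w[0].size();
      Mat out(m, std::vector<float>(n, 0.f));
      for (std::size_t i = 0; i < m; ++i) {
        for (std::size_t j = 0; j < n; ++j) {
          for (std::size_t p = 0; p < k; ++p) out[i][j] += x[i][p] * w[p][j];
          out[i][j] += bias[j];  // bias broadcast over the row dimension
        }
      }
      return out;
    }
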
egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE(grad != nullptr, paddle::platform::errors::Fatal( @@ -518,6 +519,11 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, if (grad->initialized()) { if (set_to_zero) { grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + if (is_leaf) { + std::static_pointer_cast( + egr::EagerUtils::grad_node(self->tensor)) + ->SetFakeEmpty(true); + } } else { VLOG(4) << "Gradient of " << self->tensor.name() << " is initialized, will be released."; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 4707f757d8bfb..efa0fe2cb582e 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -765,7 +765,7 @@ PyObject* ToPyObject(const std::unordered_map& value) { // For Final State Dygraph, // We directly use paddle::optional(Tensor) as dispensable Tensor -paddle::optional GetOptionalTensorFromArgs( +paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable) { PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); @@ -784,7 +784,7 @@ paddle::optional GetOptionalTensorFromArgs( } if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type))) { - return paddle::make_optional( + return paddle::make_optional( reinterpret_cast(obj)->tensor); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index c8e1cd4ad0b75..7f94f6c90e5a0 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -185,7 +185,7 @@ paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type, paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type, ssize_t arg_pos); -paddle::optional GetOptionalTensorFromArgs( +paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 53379373d2518..6bb85da8c466f 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -55,13 +55,9 @@ void BindGenerator(py::module* m_ptr) { }) .def("seed", &framework::Generator::Seed) .def("initial_seed", &framework::Generator::GetCurrentSeed) - .def("random", &framework::Generator::Random64) - // .def("get_cpu_engine", &framework::Generator::GetCPUEngine) - // .def("set_cpu_engine", &framework::Generator::SetCPUEngine) - .def_property("_is_init_py", &framework::Generator::GetIsInitPy, - &framework::Generator::SetIsInitPy); + .def("random", &framework::Generator::Random64); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); - m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); + m.def("default_cuda_generator", &framework::DefaultCUDAGenerator); m.def("set_random_seed_generator", &framework::SetRandomSeedGenerator); m.def("get_random_seed_generator", &framework::GetRandomSeedGenerator); } diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2b849968c76f9..972e8aafab758 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -32,10 +32,16 @@ std::map> op_ins_map = { {"fused_attention", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "SrcMask", "OutLinearW", "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, + 
{"fused_gate_attention", + {"Query", "Key", "QueryWeight", "KeyWeight", "ValueWeight", "QKVWeight", + "NonbatchedBias", "SrcMask", "GateWeight", "GateBias", "OutLinearWeight", + "OutLinearBias"}}, {"fused_multi_transformer", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "TimeStep", "SrcMask", "OutLinearW", "OutLinearBias", "FFNLnScale", "FFNLnBias", "FFN1Weight", "FFN1Bias", "FFN2Weight", "FFN2Bias"}}, + {"fused_bias_dropout_residual_layer_norm", + {"X", "Residual", "Bias", "LnScale", "LnBias"}}, {"instance_norm", {"X", "Scale", "Bias"}}, {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, {"label_smooth", {"X", "PriorDist"}}, @@ -119,6 +125,11 @@ std::map> op_ins_map = { {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, {"inplace_abn", {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, + {"linear_interp", {"X", "OutSize"}}, + {"bilinear_interp", {"X", "OutSize"}}, + {"trilinear_interp", {"X", "OutSize"}}, + {"nearest_interp", {"X", "OutSize"}}, + {"bicubic_interp", {"X", "OutSize"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -148,6 +159,11 @@ std::map> op_outs_map = { "DropoutMaskOut", "Ln2Mean", "Ln2Variance", "BiasDropoutResidualOut", "CacheKVOut", "Y"}}, + {"fused_bias_dropout_residual_layer_norm", + {"BiasDropoutResidualOut", "DropoutMaskOut", "LnMean", "LnVariance", "Y"}}, + {"fused_gate_attention", + {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut", + "QKVTransposeOut", "SoftmaxOut", "FMHAOut", "GateOut", "Out"}}, {"sync_batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, @@ -259,6 +275,7 @@ std::map> op_passing_outs_map = { {"split", {"Out"}}, {"concat", {"Out"}}, {"fused_multi_transformer", {"CacheKVOut"}}, + {"group_norm", {"Mean", "Variance"}}, }; // NOTE(pangyoki): Tensor View Strategy. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f6be9b66d5dbd..0e1271c1fe07f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle. } return stats_map; }); - m.def("memory_stat_get_current", memory::StatGetCurrentValue); - m.def("memory_stat_get_peak", memory::StatGetPeakValue); + m.def("device_memory_stat_current_value", + memory::DeviceMemoryStatCurrentValue); + m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def("run_cmd", [](const std::string &cmd, int time_out = -1, int sleep_inter = -1) -> const std::string { diff --git a/paddle/infrt/tensor/phi/tensor_map.cc b/paddle/infrt/tensor/phi/tensor_map.cc index 7690322aed4a3..afac7175caf4f 100644 --- a/paddle/infrt/tensor/phi/tensor_map.cc +++ b/paddle/infrt/tensor/phi/tensor_map.cc @@ -13,6 +13,8 @@ // limitations under the License. 
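The generator tables extended above are ordered name lists keyed by op type, which the pybind code generator consults to know which inputs and outputs an op exposes and in what order. A reduced model of that table, reusing a few entries from the hunk (the lookup helper is invented):

    #include <map>
    #include <string>
    #include <vector>

    static const std::map<std::string, std::vector<std::string>> kOpInsMap = {
        {"linear_interp", {"X", "OutSize"}},
        {"bilinear_interp", {"X", "OutSize"}},
        {"fused_bias_dropout_residual_layer_norm",
         {"X", "Residual", "Bias", "LnScale", "LnBias"}},
    };

    const std::vector<std::string>* LookupInputNames(const std::string& op) {
      auto it = kOpInsMap.find(op);
      return it == kOpInsMap.end() ? nullptr : &it->second;
    }
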
#include "paddle/infrt/tensor/phi/tensor_map.h" + +#include "glog/logging.h" #include "llvm/Support/ErrorHandling.h" namespace infrt { diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 7d28e3d27c496..004ed8de520d9 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -146,7 +146,7 @@ elseif(EXISTS "${generated_op_path}.tmp") execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" "${generated_op_path}") message("copy ${generated_op_path}.tmp ${generated_op_path}") else() - execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${generated_op_path}") + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_op_path}") message("remove ${generated_op_path}") endif() @@ -158,7 +158,7 @@ elseif(EXISTS "${generated_argument_mapping_path}.tmp") execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}") message("copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}") else() - execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${generated_argument_mapping_path}") + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_argument_mapping_path}") message("remove ${generated_argument_mapping_path}") endif() diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index d80444e7f710c..3ef7763d57e8b 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/tensor_copy.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" @@ -42,8 +41,8 @@ std::tuple adam_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -88,11 +87,8 @@ std::tuple adam_impl( auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {}); auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {}); - paddle::optional input_master_param(paddle::none); - auto input_master_param_ptr = - PrepareData(master_param, kernel.InputAt(7), {}); - paddle::optional input_skip_update(paddle::none); - auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {}); + auto input_master_param = PrepareData(master_param, kernel.InputAt(7), {}); + auto input_skip_update = PrepareData(skip_update, kernel.InputAt(8), {}); std::tuple api_output; auto kernel_out_0 = input_param.get(); @@ -101,40 +97,13 @@ std::tuple adam_impl( auto kernel_out_3 = input_beta1_pow.get(); auto kernel_out_4 = input_beta2_pow.get(); phi::DenseTensor* kernel_out_5 = nullptr; - if (input_master_param_ptr) { - input_master_param = - paddle::make_optional(*input_master_param_ptr); - kernel_out_5 = - paddle::make_optional(*input_master_param_ptr) - .get_ptr(); + if (input_master_param) { + kernel_out_5 = input_master_param.get_ptr(); } - if (input_skip_update_ptr) { - input_skip_update = - paddle::make_optional(*input_skip_update_ptr); - } + auto input_meta_ref_master_param = 
MakeMetaTensor(input_master_param); - paddle::optional input_meta_ref_master_param( - paddle::none); - phi::DenseTensor dt; - phi::MetaTensor input_meta_tmp_master_param(dt); - if (input_master_param_ptr) { - input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); - input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); - input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); - input_meta_ref_master_param = input_meta_tmp_master_param; - } - - paddle::optional input_meta_ref_skip_update( - paddle::none); - phi::DenseTensor dt1; - phi::MetaTensor input_meta_tmp_skip_update(dt1); - if (input_skip_update_ptr) { - input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype()); - input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims()); - input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout()); - input_meta_ref_skip_update = input_meta_tmp_skip_update; - } + auto input_meta_ref_skip_update = MakeMetaTensor(input_skip_update); phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); @@ -177,8 +146,8 @@ std::tuple adam_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, - paddle::optional, + const paddle::optional&, + const paddle::optional&, const Scalar&, const Scalar&, const Scalar&, @@ -251,8 +220,8 @@ std::tuple adam_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, - paddle::optional, + const paddle::optional&, + const paddle::optional&, const Scalar&, const Scalar&, const Scalar&, @@ -305,8 +274,8 @@ std::tuple adamw_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -351,11 +320,8 @@ std::tuple adamw_impl( auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {}); auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {}); - paddle::optional input_master_param(paddle::none); - auto input_master_param_ptr = - PrepareData(master_param, kernel.InputAt(7), {}); - paddle::optional input_skip_update(paddle::none); - auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {}); + auto input_master_param = PrepareData(master_param, kernel.InputAt(7), {}); + auto input_skip_update = PrepareData(skip_update, kernel.InputAt(8), {}); std::tuple api_output; auto kernel_out_0 = input_param.get(); @@ -364,40 +330,13 @@ std::tuple adamw_impl( auto kernel_out_3 = input_beta1_pow.get(); auto kernel_out_4 = input_beta2_pow.get(); phi::DenseTensor* kernel_out_5 = nullptr; - if (input_master_param_ptr) { - input_master_param = - paddle::make_optional(*input_master_param_ptr); - kernel_out_5 = - paddle::make_optional(*input_master_param_ptr) - .get_ptr(); + if (input_master_param) { + kernel_out_5 = input_master_param.get_ptr(); } - if (input_skip_update_ptr) { - input_skip_update = - paddle::make_optional(*input_skip_update_ptr); - } + auto input_meta_ref_master_param = MakeMetaTensor(input_master_param); - paddle::optional input_meta_ref_master_param( - paddle::none); - phi::DenseTensor dt; - phi::MetaTensor input_meta_tmp_master_param(dt); - if (input_master_param_ptr) { - input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); - 
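The adam_impl/adamw_impl cleanup above works because the helpers now accept and return optionals directly, so the call sites no longer rebuild optionals by hand around raw pointers. The same shape of simplification, modelled with std::optional and invented stand-in types:

    #include <optional>

    struct DenseLike { int dims = 0; };

    // After the refactor: the helper takes and yields optionals itself ...
    std::optional<DenseLike> PrepareLike(const std::optional<DenseLike>& in) {
      if (!in) return std::nullopt;
      return *in;  // a real helper would also transform layout/dtype/place
    }

    // ... so the caller reduces to a null check, mirroring
    // `kernel_out_5 = input_master_param.get_ptr()`.
    DenseLike* MaybeOutput(std::optional<DenseLike>& prepared) {
      return prepared ? &*prepared : nullptr;
    }
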
input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); - input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); - input_meta_ref_master_param = input_meta_tmp_master_param; - } - - paddle::optional input_meta_ref_skip_update( - paddle::none); - phi::DenseTensor dt1; - phi::MetaTensor input_meta_tmp_skip_update(dt1); - if (input_skip_update_ptr) { - input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype()); - input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims()); - input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout()); - input_meta_ref_skip_update = input_meta_tmp_skip_update; - } + auto input_meta_ref_skip_update = MakeMetaTensor(input_skip_update); phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); @@ -440,8 +379,8 @@ std::tuple adamw_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, - paddle::optional, + const paddle::optional&, + const paddle::optional&, const Scalar&, const Scalar&, const Scalar&, @@ -592,6 +531,108 @@ Tensor conv2d_impl(const Tensor& input, return api_output; } +Tensor conv3d_impl(const Tensor& input, + const Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + kernel_data_type = ParseDataType(input); + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "conv3d API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "conv3d", {kernel_backend, kernel_layout, kernel_data_type}, true); + VLOG(6) << "conv3d API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + phi::TensorArgDef args0 = kernel.InputAt(0); + phi::TensorArgDef args1 = kernel.InputAt(1); + if (kernel_backend == Backend::GPU) { + args0.backend = Backend::GPU; + args1.backend = Backend::GPU; + } + + auto input_input = PrepareData(input, args0, {}); + auto input_filter = PrepareData(filter, args1, {}); + + Tensor api_output; + auto kernel_out = SetKernelOutput(kernel_backend, &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::ConvInferMeta(MakeMetaTensor(*input_input), + MakeMetaTensor(*input_filter), + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const std::vector&, + const std::vector&, + const std::string&, + int, + const std::vector&, + const std::string&, + bool, + int, + bool, + 
phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + { + (*kernel_fn)(*dev_ctx, + *input_input, + *input_filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + kernel_out); + } + + return api_output; +} + void conv2d_grad_impl(const Tensor& input, const Tensor& filter, const Tensor& out_grad, @@ -693,12 +734,187 @@ void conv2d_grad_impl(const Tensor& input, } } +void conv3d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, out_grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "conv3d_grad API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "conv3d_grad", {kernel_backend, kernel_layout, kernel_data_type}, true); + VLOG(6) << "conv3d_grad API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + phi::TensorArgDef args0 = kernel.InputAt(0); + phi::TensorArgDef args1 = kernel.InputAt(1); + phi::TensorArgDef args2 = kernel.InputAt(2); + if (kernel_backend == Backend::GPU) { + args0.backend = Backend::GPU; + args1.backend = Backend::GPU; + args2.backend = Backend::GPU; + } + + auto input_input = PrepareData(input, args0, {}); + auto input_filter = PrepareData(filter, args1, {}); + auto input_out_grad = PrepareData(out_grad, args2, {}); + + auto kernel_out_0 = SetKernelOutput(kernel_backend, input_grad); + auto kernel_out_1 = SetKernelOutput(kernel_backend, filter_grad); + phi::MetaTensor meta_out_0(kernel_out_0); + phi::MetaTensor meta_out_1(kernel_out_1); + + phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input), + MakeMetaTensor(*input_filter), + kernel_out_0 ? &meta_out_0 : nullptr, + kernel_out_1 ? 
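conv3d_impl above resolves its kernel key by filling only the still-undefined backend/layout/dtype slots from the highest-priority key parsed off the inputs. That fallback pattern in isolation (the enums and the Undefined sentinel are stand-ins, not phi's real types):

    enum class Backend { Undefined, CPU, GPU };
    enum class Layout { Undefined, NCHW, NHWC };
    enum class DType { Undefined, FP32, FP16 };

    struct KernelKey {
      Backend backend = Backend::Undefined;
      Layout layout = Layout::Undefined;
      DType dtype = DType::Undefined;
    };

    // Keep whatever the caller already pinned down; fill the rest from inputs.
    KernelKey ResolveKernelKey(KernelKey requested,
                               const KernelKey& from_inputs) {
      if (requested.backend == Backend::Undefined)
        requested.backend = from_inputs.backend;
      if (requested.layout == Layout::Undefined)
        requested.layout = from_inputs.layout;
      if (requested.dtype == DType::Undefined)
        requested.dtype = from_inputs.dtype;
      return requested;
    }
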
&meta_out_1 : nullptr); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const std::vector&, + const std::vector&, + const std::string&, + int, + const std::vector&, + const std::string&, + bool, + int, + bool, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + { + (*kernel_fn)(*dev_ctx, + *input_input, + *input_filter, + *input_out_grad, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + kernel_out_0, + kernel_out_1); + } +} + Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { Tensor out; copy(x, place, blocking, &out); return out; } +Tensor embedding_impl(const Tensor& x, + const Tensor& weight, + int64_t padding_idx, + bool sparse) { + DataType kernel_data_type = ParseDataType(weight); + auto kernel_key_set = ParseKernelKeyByInputArgs(weight); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "embedding API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + Tensor api_output; + + if (phi::DenseTensor::classof(weight.impl().get())) { + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + "embedding", + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << "embedding API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = PrepareData(weight, kernel.InputAt(1), {}); + + auto* kernel_out = SetKernelOutput(kernel_key.backend(), &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::EmbeddingInferMeta(MakeMetaTensor(*input_x), + MakeMetaTensor(*input_weight), + padding_idx, + sparse, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + { + (*kernel_fn)(*dev_ctx, *input_x, *input_weight, padding_idx, kernel_out); + } + } else { + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + "sparse_weight_embedding", + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << "sparse_weight_embedding API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = TensorToSelectedRows(weight); + + auto* kernel_out = SetKernelOutput(kernel_key.backend(), &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::EmbeddingInferMeta(MakeMetaTensor(*input_x), + MakeMetaTensor(*input_weight), + padding_idx, + sparse, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + { + (*kernel_fn)(*dev_ctx, *input_x, *input_weight, padding_idx, kernel_out); + } + } + return api_output; +} + std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis) { @@ -761,7 +977,7 @@ std::tuple momentum_impl( const Tensor& grad, const Tensor& velocity, const Tensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -802,32 +1018,18 @@ std::tuple 
momentum_impl( auto input_grad = PrepareData(grad, kernel.InputAt(1), {}); auto input_velocity = PrepareData(velocity, kernel.InputAt(2), {}); auto input_learning_rate = PrepareData(learning_rate, kernel.InputAt(3), {}); - paddle::optional input_master_param(paddle::none); - auto input_master_param_ptr = - PrepareData(master_param, kernel.InputAt(4), {}); + auto input_master_param = PrepareData(master_param, kernel.InputAt(4), {}); std::tuple api_output; auto kernel_out_0 = input_param.get(); auto kernel_out_1 = input_velocity.get(); phi::DenseTensor* kernel_out_2 = nullptr; - if (input_master_param_ptr) { - input_master_param = - paddle::make_optional(*input_master_param_ptr); - kernel_out_2 = - paddle::make_optional(*input_master_param_ptr) - .get_ptr(); + if (input_master_param) { + kernel_out_2 = input_master_param.get_ptr(); } - paddle::optional input_meta_ref_master_param( - paddle::none); - phi::DenseTensor dt; - phi::MetaTensor input_meta_tmp_master_param(dt); - if (input_master_param_ptr) { - input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); - input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); - input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); - input_meta_ref_master_param = input_meta_tmp_master_param; - } + auto input_meta_ref_master_param = MakeMetaTensor(input_master_param); + phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); if (kernel_out_2) { @@ -868,7 +1070,7 @@ std::tuple momentum_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, + const paddle::optional&, float, bool, const std::string&, @@ -903,7 +1105,7 @@ std::tuple sgd_impl( const Tensor& param, const Tensor& learning_rate, const Tensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision) { DataType kernel_data_type = ParseDataType(param); auto kernel_key_set = ParseKernelKeyByInputArgs(param, learning_rate, grad); @@ -941,17 +1143,8 @@ std::tuple sgd_impl( if (phi::DenseTensor::classof(param_tensor.get())) { auto in_param = PrepareData(param, kernel.InputAt(0), {}); - auto in_master_param = PrepareData(master_param, kernel.InputAt(3), {}); - - paddle::optional in_master_param_opt = - master_param - ? paddle::make_optional(*in_master_param) - : paddle::none; - auto master_param_meta = MakeMetaTensor(in_master_param_opt); - paddle::optional master_param_meta_opt = - master_param - ? paddle::make_optional(*master_param_meta) - : paddle::none; + auto in_master_param_opt = PrepareData(master_param, kernel.InputAt(3), {}); + auto master_param_meta_opt = MakeMetaTensor(in_master_param_opt); phi::DenseTensor* kernel_out_0 = SetKernelOutput(kernel_key.backend(), &std::get<0>(out)); @@ -975,7 +1168,7 @@ std::tuple sgd_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, + const paddle::optional&, bool, phi::DenseTensor*, phi::DenseTensor*); @@ -1004,7 +1197,7 @@ std::tuple sgd_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::SelectedRows&, - paddle::optional, + const paddle::optional&, bool, phi::DenseTensor*, phi::DenseTensor*); @@ -1021,16 +1214,8 @@ std::tuple sgd_impl( } else { auto in_param = TensorToSelectedRows(param); auto in_grad = TensorToSelectedRows(grad); - auto in_master_param = TensorToSelectedRows(master_param); - auto in_master_param_opt = - master_param - ? 
paddle::make_optional(*in_master_param) - : paddle::none; + auto in_master_param_opt = TensorToSelectedRows(master_param); auto master_param_meta = MakeMetaTensor(in_master_param_opt); - paddle::optional master_param_meta_opt = - master_param - ? paddle::make_optional(*master_param_meta) - : paddle::none; phi::SelectedRows* kernel_out_0 = SetSelectedRowsKernelOutput(kernel_key.backend(), &std::get<0>(out)); @@ -1042,7 +1227,7 @@ std::tuple sgd_impl( SgdInferMeta(MakeMetaTensor(*in_param), MakeMetaTensor(*in_learning_rate), MakeMetaTensor(*in_grad), - master_param_meta_opt, + master_param_meta, multi_precision, &meta_out_0, &meta_out_1); @@ -1052,7 +1237,7 @@ std::tuple sgd_impl( const phi::SelectedRows&, const phi::DenseTensor&, const phi::SelectedRows&, - paddle::optional, + const paddle::optional&, bool, phi::SelectedRows*, phi::SelectedRows*); @@ -1268,6 +1453,125 @@ void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad) { (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); } +void embedding_grad_impl(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx, + bool sparse, + Tensor* weight_grad) { + DataType kernel_data_type = ParseDataType(weight); + auto kernel_key_set = ParseKernelKeyByInputArgs(weight); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "embedding_grad API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + if (phi::DenseTensor::classof(weight.impl().get())) { + std::string kernel_name = + sparse ? "embedding_sparse_grad" : "embedding_grad"; + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = PrepareData(weight, kernel.InputAt(1), {}); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {}); + + if (sparse) { + auto* kernel_out = + SetSelectedRowsKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + meta_out.set_dims(input_weight->dims()); + meta_out.set_dtype(input_weight->dtype()); + kernel_out->set_height(input_weight->dims()[0]); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } else { + auto* kernel_out = SetKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } + } else { + std::string kernel_name = sparse ? 
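In the sparse branch of embedding_grad_impl above, the weight gradient is emitted as a SelectedRows object (row ids plus one value row per id, with the full table height recorded) instead of a dense matrix. A simplified model of that layout, not phi::SelectedRows itself:

    #include <cstdint>
    #include <vector>

    struct SelectedRowsGradLike {
      std::vector<int64_t> rows;               // ids of the rows that were looked up
      std::vector<std::vector<float>> values;  // one gradient row per id above
      int64_t height = 0;                      // row count of the full dense table
    };

    SelectedRowsGradLike MakeSparseGrad(
        const std::vector<int64_t>& ids,
        const std::vector<std::vector<float>>& out_grad_rows,
        int64_t table_height) {
      return {ids, out_grad_rows, table_height};
    }
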
"sparse_weight_embedding_sparse_grad" + : "sparse_weight_embedding_grad"; + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = TensorToSelectedRows(weight); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {}); + + if (sparse) { + auto* kernel_out = + SetSelectedRowsKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + const phi::DenseTensor&, + int64_t, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } else { + auto* kernel_out = SetKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + meta_out.set_dims(input_weight->GetCompleteDims()); + meta_out.set_dtype(input_weight->dtype()); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } + } +} + void real_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index d88a134654caf..22c5d193a2bcd 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -39,8 +39,8 @@ std::tuple adam_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -57,8 +57,8 @@ std::tuple adamw_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -96,8 +96,25 @@ Tensor conv2d_impl(const Tensor& input, int workspace_size_MB, bool exhaustive_search); +Tensor conv3d_impl(const Tensor& input, + const Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search); + Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); +Tensor embedding_impl(const Tensor& x, + const Tensor& weight, + int64_t padding_idx, + bool sparse); + std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis); @@ -107,7 +124,7 @@ std::tuple momentum_impl( const Tensor& grad, const Tensor& velocity, const Tensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -119,7 
+136,7 @@ std::tuple sgd_impl( const Tensor& param, const Tensor& learning_rate, const Tensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision); ////////////////// Backward(grad) api impls ////////////////////// @@ -143,8 +160,30 @@ void conv2d_grad_impl(const Tensor& input, Tensor* input_grad, Tensor* filter_grad); +void conv3d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad); + void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad); +void embedding_grad_impl(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx, + bool sparse, + Tensor* weight_grad); + void real_grad_impl(const Tensor& out_grad, Tensor* x_grad); } // namespace experimental diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 2111829b8d60b..633bb1a32a133 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -23,10 +23,10 @@ std::shared_ptr TensorToDenseTensor(const Tensor& tensor) { return std::static_pointer_cast(tensor.impl()); } -std::shared_ptr TensorToDenseTensor( - const paddle::optional& tensor) { +paddle::optional TensorToDenseTensor( + const paddle::optional& tensor) { if (tensor) { - return std::static_pointer_cast(tensor->impl()); + return {*std::static_pointer_cast(tensor->impl())}; } return nullptr; } @@ -48,10 +48,10 @@ std::shared_ptr TensorToSelectedRows(const Tensor& tensor) { return std::static_pointer_cast(tensor.impl()); } -std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor) { +paddle::optional TensorToSelectedRows( + const paddle::optional& tensor) { if (tensor) { - return std::static_pointer_cast(tensor->impl()); + return {*std::static_pointer_cast(tensor->impl())}; } return nullptr; } @@ -66,12 +66,12 @@ phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } -paddle::optional MakeMetaTensor( - const paddle::optional& tensor) { +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; } - return {paddle::none}; + return phi::MetaTensor(); } std::vector MakeMetaTensor( @@ -98,12 +98,12 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } -paddle::optional MakeMetaTensor( - const paddle::optional& tensor) { +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; } - return {paddle::none}; + return phi::MetaTensor(); } phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) { diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 7303e6b46114d..83656a7b528a6 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" @@ -33,7 +32,7 @@ enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO, STRING_TENSOR }; std::shared_ptr TensorToDenseTensor(const Tensor& tensor); -std::shared_ptr TensorToDenseTensor( +paddle::optional TensorToDenseTensor( const paddle::optional& tensor); std::unique_ptr> TensorToDenseTensor( @@ -41,8 +40,8 @@ std::unique_ptr> TensorToDenseTensor( std::shared_ptr TensorToSelectedRows(const Tensor& tensor); -std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor); +paddle::optional TensorToSelectedRows( + const paddle::optional& tensor); std::shared_ptr TensorToStringTensor(const Tensor& tensor); @@ -50,8 +49,8 @@ std::shared_ptr TensorToStringTensor(const Tensor& tensor); phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor); -paddle::optional MakeMetaTensor( - const paddle::optional& tensor); +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor); std::vector MakeMetaTensor( const std::vector& tensors); @@ -61,8 +60,8 @@ std::vector MakeMetaTensor( phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); -paddle::optional MakeMetaTensor( - const paddle::optional& tensor); +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor); phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index b00311061c9d0..12f7b8bba5870 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -174,20 +174,6 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, if (!platform::is_cuda_pinned_place(tensor.place())) { pool.Get(tensor.place())->Wait(); pool.Get(dst_place)->Wait(); - } else if (platform::is_gpu_place(dst_place)) { - auto* dev_ctx = static_cast(pool.Get(dst_place)); - phi::Copy(*dev_ctx, tensor, dst_place, false, &out); - - // Note: This is an empty callback, the only way is to "reference" - // tensor, so it will not be destructed until the kernels launched at - // current - // stream of given place is finished. 
- auto callback = [tensor, dst_place]() { - VLOG(4) << "Run callback of tensor:" << &tensor << " at place " - << dst_place; - }; - dev_ctx->AddStreamCallback(callback); - return out; } #endif @@ -204,23 +190,31 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, return out; } -phi::DenseTensor TransformData(const phi::DenseTensor& tensor, +phi::DenseTensor TransformData(phi::DenseTensor* tensor, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { - phi::DenseTensor out = tensor; + phi::DenseTensor out = *tensor; + bool trans_layout = false; + bool trans_dtype = false; if (NeedTransformLayout( - tensor.layout(), target_args_def.layout, transform_flag)) { + tensor->layout(), target_args_def.layout, transform_flag)) { out = TransDataLayout(out, target_args_def.layout); + trans_layout = true; } if (NeedTransformDataType( - tensor.dtype(), target_args_def.dtype, transform_flag)) { + tensor->dtype(), target_args_def.dtype, transform_flag)) { out = TransDataType(out, target_args_def.dtype); + trans_dtype = true; } if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { out = TransDataPlace(out, phi::TransToPhiPlace(target_args_def.backend)); + if (!trans_layout && !trans_dtype && + tensor->place().GetType() == AllocationType::GPUPINNED) { + tensor->ShareBufferWith(out); + } } return out; } @@ -243,31 +237,20 @@ std::shared_ptr PrepareData( return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = - TransformData(dense_tensor, target_args_def, transform_flag); + TransformData(&dense_tensor, target_args_def, transform_flag); return std::make_shared(std::move(out)); } return nullptr; } -std::shared_ptr PrepareData( +paddle::optional PrepareData( const paddle::optional& input, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { if (input) { - return PrepareData(*input, target_args_def, transform_flag); - } - return {nullptr}; -} - -std::shared_ptr PrepareData( - const paddle::optional input, - const phi::TensorArgDef& target_args_def, - const TransformFlag& transform_flag) { - if (input.get_ptr() != nullptr) { - return PrepareData(*(input.get_ptr()), target_args_def, transform_flag); + return {*PrepareData(*input, target_args_def, transform_flag)}; } - - return {nullptr}; + return paddle::none; } std::unique_ptr> PrepareData( @@ -290,7 +273,7 @@ std::unique_ptr> PrepareData( *std::dynamic_pointer_cast(tensor_in)); } else { pt_tensors->emplace_back( - TransformData(*(static_cast(tensor_in.get())), + TransformData((static_cast(tensor_in.get())), target_args_def, transform_flag)); } diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index f5537961d0ac7..4d70078ef3444 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -66,7 +66,7 @@ std::shared_ptr PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); -std::shared_ptr PrepareData( +paddle::optional PrepareData( const paddle::optional& input, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); @@ -76,10 +76,5 @@ std::unique_ptr> PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); -std::shared_ptr PrepareData( - const paddle::optional input, - const phi::TensorArgDef& target_args_def, - const TransformFlag& transform_flag); - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/kernel_dispatch.h 
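TransformData above now receives the source tensor by pointer so that, when neither layout nor dtype changed and the source sits in GPU-pinned memory, the source can share the freshly transferred buffer instead of keeping a second copy alive. The guard in isolation, with invented stand-in types:

    #include <memory>

    enum class Place { kCPU, kGPU, kGPUPinned };

    struct Buffer {};
    struct TensorLike {
      Place place = Place::kCPU;
      std::shared_ptr<Buffer> holder;
      void ShareBufferWith(const TensorLike& other) { holder = other.holder; }
    };

    // Only a pure place transform on a pinned source is safe to share back.
    void MaybeShareBack(TensorLike* src, const TensorLike& transformed,
                        bool trans_layout, bool trans_dtype) {
      if (!trans_layout && !trans_dtype && src->place == Place::kGPUPinned) {
        src->ShareBufferWith(transformed);
      }
    }
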
b/paddle/phi/api/lib/kernel_dispatch.h index 29254a0486d00..1091e0556da8b 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -125,8 +125,8 @@ struct KernelKeyParser : ArgsIterator { key_set.dtype = tensor.dtype(); } - void operator()(const paddle::optional x) { - if (x.get_ptr() != nullptr) { + void operator()(const paddle::optional& x) { + if (x) { const phi::TensorBase& tensor = *(x.get_ptr()->impl()); AssignKernelKeySet(tensor); } diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 048e4f2b428f2..8d64246bdb69f 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include +#include "glog/logging.h" #include "paddle/fluid/framework/custom_operator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index c88e2e367feed..71ba8eaae2d36 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" namespace paddle { diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc index 57e3c28d8cb1f..85de3601fd96a 100644 --- a/paddle/phi/api/lib/tensor_copy.cc +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 5689b2d43a4f2..0e1cd0cb83fd4 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS +cc_library(phi_api_utils SRCS tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor int_array scalar) diff --git a/paddle/phi/api/lib/utils/allocator.h b/paddle/phi/api/lib/utils/allocator.h index 84a089e5899ec..96f1294102ae1 100644 --- a/paddle/phi/api/lib/utils/allocator.h +++ b/paddle/phi/api/lib/utils/allocator.h @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/utils/storage.cc b/paddle/phi/api/lib/utils/storage.cc deleted file mode 100644 index 09ff18d10e312..0000000000000 --- a/paddle/phi/api/lib/utils/storage.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/api/lib/utils/storage.h" - -namespace paddle { -namespace experimental { - -ExternalStorage::ExternalStorage(void* ptr, - size_t size, - const phi::Place& place) - : phi::Storage(std::make_shared(ptr, size, place)), - size_(size) {} - -ExternalStorage::ExternalStorage(const phi::intrusive_ptr& root, - size_t delta, - size_t size) - : Storage(std::make_shared( - static_cast(root->data()) + delta, size, root->place())), - size_(size) { - PADDLE_ENFORCE_LE( - static_cast(delta + size), - root->size(), - phi::errors::InvalidArgument("The size of the external storage does " - "not meet the metadata requirements.")); -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/storage.h b/paddle/phi/api/lib/utils/storage.h deleted file mode 100644 index 5fe17bc51b68a..0000000000000 --- a/paddle/phi/api/lib/utils/storage.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/phi/core/storage.h" - -namespace paddle { -namespace experimental { - -class ExternalStorage : public phi::Storage { - public: - ExternalStorage(void* ptr, size_t size, const phi::Place& place); - ExternalStorage(const phi::intrusive_ptr& root, - size_t delta, - size_t size); - - static const char* name() { return "ExternalStorage"; } - - void Realloc(size_t n) override { - PADDLE_THROW(phi::errors::Unavailable( - "The external shared storage cannot be reallocated.")); - } - - void Clear() override { - data_ = nullptr; - size_ = 0; - } - - void set_data_shared( - const std::shared_ptr& holder) override { - CHECK(holder); - data_ = holder; - size_ = holder->size(); - } - - std::shared_ptr&& move_data_shared() override { - size_ = 0; - return std::move(data_); - } - - size_t size() const noexcept override { return size_; } - const phi::Place& place() const override { - PADDLE_ENFORCE_NOT_NULL( - data_, - phi::errors::Unavailable( - "Unable to visit place as data_ has not been initialized yet.")); - return data_->place(); - } - bool OwnsMemory() const noexcept override { return false; } - - private: - int64_t size_{0}; -}; - -class TensorStorage : public paddle::memory::allocation::Allocation { - public: - explicit TensorStorage(phi::intrusive_ptr storage) - : paddle::memory::allocation::Allocation( - storage->data(), storage->size(), storage->place()), - storage_(std::move(storage)) {} - - private: - phi::intrusive_ptr storage_; -}; - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 00199da1280e8..36a0901bbe980 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/variable.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 9861bd68e4a9e..06d3e435bc110 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" #include "paddle/phi/core/stream.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 93513067a268b..01c19e8a55fdf 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -26,18 +26,6 @@ public: */ explicit DenseTensor(paddle::experimental::DataType dtype); -/// \brief Use existing storage space to create dense tensor. This interface -/// can be used to deliberately create an uninitialized dense tensor. -/// \param storage The existing storage. -/// \param meta The meta data of dense tensor. -DenseTensor(intrusive_ptr storage, const DenseTensorMeta& meta); - -/// \brief Use existing storage space to create dense tensor. This interface -/// can be used to deliberately create an uninitialized dense tensor. -/// \param storage The existing storage. -/// \param meta The meta data of dense tensor. 
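// With phi::Storage and ExternalStorage deleted in the hunks above, a tensor keeps its
// buffer as a phi::Allocation holder directly. A hedged sketch of adopting an externally
// owned buffer, mirroring what the deleted ExternalStorage constructor did internally
// (a std::make_shared phi::Allocation over ptr/size/place). It assumes DenseTensor's
// shared-Allocation-plus-meta constructor is available; the float dtype and 1-D shape
// are illustrative, and the caller still owns and must outlive the raw buffer.
#include <cstddef>
#include <memory>

#include "paddle/phi/core/allocator.h"     // phi::Allocation
#include "paddle/phi/core/dense_tensor.h"  // phi::DenseTensor, phi::DenseTensorMeta

phi::DenseTensor WrapExternalBuffer(void* ptr, size_t bytes, const phi::Place& place) {
  // Non-owning view over the caller's memory, as ExternalStorage used to build.
  auto holder = std::make_shared<phi::Allocation>(ptr, bytes, place);
  phi::DenseTensorMeta meta(
      phi::DataType::FLOAT32,
      phi::make_ddim({static_cast<int64_t>(bytes / sizeof(float))}));
  return phi::DenseTensor(holder, meta);  // the holder replaces the intrusive Storage
}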
-DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta); - inline bool IsInitialized() const { return holder_ != nullptr; } template diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 3c030cac2e7c9..8c97b6bf223fb 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -18,9 +18,10 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/fluid/memory/malloc.h" + #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_utils.h" #endif @@ -211,13 +212,6 @@ LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::phi::dtype::complex) /* From framework::LoDTensor */ /* ------------------------------ */ -DenseTensor::DenseTensor(intrusive_ptr storage, - const DenseTensorMeta& meta) - : meta_(meta), holder_(storage->move_data_shared()) {} - -DenseTensor::DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta) - : meta_(std::move(meta)), holder_(storage->move_data_shared()) {} - DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; } void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; } diff --git a/paddle/phi/core/generator.h b/paddle/phi/core/generator.h index 29ea92cbe6d94..3263b2a525732 100644 --- a/paddle/phi/core/generator.h +++ b/paddle/phi/core/generator.h @@ -49,12 +49,6 @@ class Generator { virtual std::pair IncrementOffset( uint64_t increament_offset) = 0; - // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with - // old seed, and it should be removed after all random-related operators - // and unittests upgrades to use generator. - virtual void SetIsInitPy(bool) = 0; - virtual bool GetIsInitPy() const = 0; - virtual uint64_t get_device_id() = 0; }; diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index 1d61f55f9dcd2..9d2b85435c7f3 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -65,14 +65,6 @@ const MetaTensor& InferMetaContext::InputAt(size_t idx) const { return inputs_.at(idx); } -paddle::optional InferMetaContext::OptionalInputAt( - size_t idx) const { - const auto& input = inputs_.at(idx); - return input.initialized() - ? paddle::optional{input} - : paddle::optional{paddle::none}; -} - std::vector InferMetaContext::InputsBetween( size_t start, size_t end) const { std::vector result; @@ -86,7 +78,7 @@ std::vector InferMetaContext::InputsBetween( return result; } -paddle::optional> +paddle::optional> InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { const auto& first = inputs_.at(start); @@ -99,9 +91,9 @@ InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { result.emplace_back(in.initialized() ? 
&in : nullptr); } - return paddle::optional>(result); + return paddle::optional>(std::move(result)); } - return paddle::optional>(paddle::none); + return paddle::none; } MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index b974f2c868a8a..d27d8bc7624be 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -50,11 +50,10 @@ class InferMetaContext { paddle::small_vector outputs); virtual const MetaTensor& InputAt(size_t idx) const; - virtual paddle::optional OptionalInputAt(size_t idx) const; virtual std::vector InputsBetween(size_t start, size_t end) const; - virtual paddle::optional> + virtual paddle::optional> OptionalInputsBetween(size_t start, size_t end) const; virtual MetaTensor* MutableOutputAt(size_t idx); @@ -151,24 +150,6 @@ struct InferMetaFnImpl { } }; - template - struct InferMetaFnCallHelper, Tail...> { - template - static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { - static_assert(attr_idx == 0, - "InferMeta's Input should appear before Attributes."); - static_assert(out_idx == 0, - "InferMeta's Input should appear before Outputs."); - const std::pair range = ctx->InputRangeAt(in_idx); - auto arg = ctx->OptionalInputAt(range.first); - - InferMetaFnCallHelper< - Tail...>::template Call(ctx, - pargs..., - arg); - } - }; - template struct InferMetaFnCallHelper&, Tail...> { template @@ -189,7 +170,7 @@ struct InferMetaFnImpl { template struct InferMetaFnCallHelper< - paddle::optional>, + const paddle::optional>&, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { @@ -198,7 +179,7 @@ struct InferMetaFnImpl { static_assert(out_idx == 0, "InferMeta's Input should appear before Outputs."); const std::pair range = ctx->InputRangeAt(in_idx); - paddle::optional> arg = + paddle::optional> arg = ctx->OptionalInputsBetween(range.first, range.second); InferMetaFnCallHelper< Tail...>::template Call(ctx, diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 8b43239d352b3..0f155f445ec9b 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -81,11 +81,11 @@ class KernelContext { } template - paddle::optional OptionalInputAt(size_t idx) const { - const auto& input = inputs_.at(idx); - return input ? paddle::optional{static_cast< - const TensorType&>(*input)} - : paddle::optional{paddle::none}; + paddle::optional OptionalInputAt(size_t idx) const { + const auto* input = inputs_.at(idx); + return input ? 
paddle::make_optional( + *(static_cast(input))) + : paddle::none; } template @@ -99,7 +99,7 @@ class KernelContext { } template - paddle::optional> OptionalInputsBetween( + paddle::optional> OptionalInputsBetween( size_t start, size_t end) { const auto& first = inputs_.at(start); @@ -109,9 +109,9 @@ class KernelContext { auto* t = static_cast(inputs_.at(i)); v.emplace_back(t); } - return paddle::optional>(v); + return paddle::optional>(std::move(v)); } - return paddle::optional>(paddle::none); + return paddle::none; } template diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 36ab9c081cc37..41e1e2b53a9e9 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -76,20 +76,20 @@ struct KernelArgsParseFunctor { default_key.dtype(), arg_type); } else if (arg_type == std::type_index(typeid( - paddle::optional))) { + const paddle::optional&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - paddle::optional< - const std::vector>))) { + } else if (arg_type == + std::type_index(typeid(const paddle::optional< + std::vector>&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); } else if (arg_type == std::type_index(typeid( - paddle::optional))) { + const paddle::optional&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index f548d1da2d4e7..d4765d1c4c3b4 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -85,7 +85,7 @@ namespace phi { #define PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type) \ template \ - struct KernelCallHelper, Tail...> { \ + struct KernelCallHelper&, Tail...> { \ template \ struct KernelCallHelper< \ - paddle::optional>, \ + const paddle::optional>&, \ Tail...> { \ template & range = ctx->InputRangeAt(in_idx); \ - paddle::optional> arg = \ + paddle::optional> arg = \ ctx->OptionalInputsBetween(range.first, range.second); \ KernelCallHelper:: \ template Compute( \ diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 3cdbfda61d69c..d277f32d8ea9a 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" +#include "glog/logging.h" + namespace phi { // TODO(chenweihang): add other flags if needed @@ -37,7 +39,9 @@ struct MetaConfig { class MetaTensor { public: - MetaTensor() = default; + typedef void (*unspecified_bool_type)(); + + MetaTensor() : tensor_(nullptr) {} // supporting implicit construction is easier to use MetaTensor(TensorBase* tensor) : tensor_(tensor) {} // NOLINT @@ -66,12 +70,22 @@ class MetaTensor { virtual bool initialized() const; + virtual operator unspecified_bool_type() const { + return tensor_ == nullptr ? 
0 : unspecified_bool_true; + } + + virtual bool operator!() const { return tensor_ == nullptr; } + + protected: + static void unspecified_bool_true() {} + private: // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; TensorBase* tensor() const; - TensorBase* tensor_; + + TensorBase* tensor_ = nullptr; }; } // namespace phi diff --git a/paddle/phi/core/storage.h b/paddle/phi/core/storage.h deleted file mode 100644 index 24dc2c4a4f90b..0000000000000 --- a/paddle/phi/core/storage.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "boost/intrusive_ptr.hpp" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/utils/intrusive_ptr.h" -#include "paddle/phi/core/utils/intrusive_ref_counter.h" -#include "paddle/phi/core/utils/type_info.h" - -namespace phi { - -/// \brief The interface of contiguous storage used for the dense tensor. -/// It should be used in conjunction with the intrusive pointer. We prohibit -/// all default copy operations to ensure the integrity of the package. -class Storage : public intrusive_ref_counter { - public: - Storage() = default; - Storage(const Storage&) = delete; - - /* @jim19930609: Following interfaces will be modified/replaced/removed - as soon as the new Allocation - Allocator design get - finalized. - */ - - /* --------- shared_ptr -------- */ - // Initialize a Storage with unique Allocation - explicit Storage(std::shared_ptr&& data) - : data_(std::move(data)) {} - - // Initialize a Storage shareing Allocation with another storage - explicit Storage(const std::shared_ptr& data) - : data_(data) {} - - void* data() const { - return data_ ? 
reinterpret_cast( - reinterpret_cast(data_->ptr())) - : nullptr; - } - - const std::shared_ptr& data_shared() const { return data_; } - - virtual void set_data_shared( - const std::shared_ptr& holder) = 0; - - virtual std::shared_ptr&& move_data_shared() = 0; - - virtual void ReallocShared(size_t n) { - PADDLE_THROW(phi::errors::Unimplemented( - "ReallocShared has not been overrided by the current Storage")); - } - /* --------- shared_ptr -------- */ - - virtual ~Storage() = default; - - virtual void Clear() = 0; - - virtual size_t size() const = 0; - virtual const Place& place() const = 0; - virtual bool OwnsMemory() const = 0; - virtual void Realloc(size_t n) = 0; - - protected: - std::shared_ptr data_; -}; - -class TensorStorage : public Storage { - public: - explicit TensorStorage(Allocator* a) : alloc_(a) {} - - TensorStorage(Allocator* a, size_t size) - : Storage(a->Allocate(size)), alloc_(a) { - size_ = data_->size(); - } - - void Clear() override { - data_ = nullptr; - size_ = 0; - } - - void Realloc(size_t size) override; - - ~TensorStorage() = default; - - static const char* name() { return "TensorStorage"; } - - size_t size() const noexcept override { return size_; } - - const Place& place() const override { - if (!data_) { - PADDLE_THROW(phi::errors::Unimplemented( - "Unable to visit place: either data_ or alloc_ has to be initialized " - "first.")); - } - return data_->place(); - } - - bool OwnsMemory() const noexcept override { return true; } - - void set_data_shared( - const std::shared_ptr& holder) override { - CHECK(holder); - data_ = holder; - size_ = holder->size(); - } - - std::shared_ptr&& move_data_shared() override { - size_ = 0; - return std::move(data_); - } - - private: - Allocator* alloc_; - int64_t size_{0}; -}; - -} // namespace phi diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 35444dc33fe78..0a4e0d6191510 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/string_tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/fluid/memory/malloc.h" namespace phi { diff --git a/paddle/phi/core/string_tensor.h b/paddle/phi/core/string_tensor.h index 916c2a2bd4a4e..94c9974f4ad74 100644 --- a/paddle/phi/core/string_tensor.h +++ b/paddle/phi/core/string_tensor.h @@ -16,7 +16,6 @@ limitations under the License. 
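// MetaTensor above gains a pre-C++11 "safe bool" conversion (unspecified_bool_type),
// so a possibly-empty MetaTensor can be tested directly in a condition without
// providing an implicit conversion to bool or int. A self-contained sketch of the
// idiom, assuming only the standard library; Wrapper and Payload are illustrative
// names, not Paddle types.
#include <cassert>

struct Payload {};

class Wrapper {
 public:
  typedef void (*unspecified_bool_type)();

  Wrapper() : ptr_(nullptr) {}
  explicit Wrapper(Payload* p) : ptr_(p) {}

  // Converts to a harmless function-pointer type rather than bool, so the object
  // works in `if` and `&&` without silently taking part in integer arithmetic.
  operator unspecified_bool_type() const {
    return ptr_ == nullptr ? 0 : unspecified_bool_true;
  }
  bool operator!() const { return ptr_ == nullptr; }

 private:
  static void unspecified_bool_true() {}
  Payload* ptr_;
};

int main() {
  Payload p;
  Wrapper full(&p);
  Wrapper empty;
  assert(full && !empty);  // reads like the `if (mask)` checks in the InferMeta code
  return 0;
}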
*/ #include "paddle/phi/common/pstring.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 602942abf4d34..521eb03fd770f 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -188,7 +188,7 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, void DeformableConvGradInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const MetaTensor& out_grad, const std::vector& strides, const std::vector& paddings, @@ -202,7 +202,7 @@ void DeformableConvGradInferMeta(const MetaTensor& x, MetaTensor* mask_grad) { GeneralTernaryGradInferMeta(x, offset, filter, dx, offset_grad, filter_grad); if (mask) { - UnchangedInferMeta(mask.get(), mask_grad); + UnchangedInferMeta(mask, mask_grad); } } @@ -312,6 +312,62 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dx->share_meta(dout); } +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + const MetaTensor& scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad) { + PADDLE_ENFORCE_NE( + x_grad, + nullptr, + phi::errors::InvalidArgument( + "The X@GRAD in InstanceNormGradInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + const int C = x_dims[1]; + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); + x_grad->set_layout(x.layout()); + if (scale_grad) { + scale_grad->set_dims({C}); + } + if (bias_grad) { + bias_grad->set_dims({C}); + } +} +void InstanceNormDoubleGradInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + const MetaTensor& ddx, + const MetaTensor& ddscale, + const MetaTensor& ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy) { + PADDLE_ENFORCE_NE( + dx, + nullptr, + phi::errors::InvalidArgument( + "The DX in InstanceNormDoubleGradInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + const int C = x_dims[1]; + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + dx->set_layout(x.layout()); + if (dscale) { + dscale->set_dims({C}); + } + if (ddy) { + ddy->share_dims(x); + } +} + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); @@ -377,9 +433,20 @@ void MultiplexGradInferMeta(const MetaTensor& ids, } } +void NanmedianGradInferMeta(const MetaTensor& x, + const MetaTensor& median_index, + const MetaTensor& out_grad, + const IntArray& axes, + bool keep_dim, + MetaTensor* x_grad) { + auto x_dims = x.dims(); + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); +} + void NllLossGradInferMeta(const MetaTensor& x, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, const MetaTensor& total_weight, const MetaTensor& out_grad, int64_t ignore_index, @@ -492,7 +559,7 @@ void PoolGradInferMeta(const MetaTensor& x, void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, const MetaTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 
c35b58d0f56e4..93e2d4c43bc3f 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -87,7 +87,7 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, void DeformableConvGradInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const MetaTensor& out_grad, const std::vector& strides, const std::vector& paddings, @@ -144,6 +144,29 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, int axis, MetaTensor* dx); +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + const MetaTensor& scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad); + +void InstanceNormDoubleGradInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + const MetaTensor& ddx, + const MetaTensor& ddscale, + const MetaTensor& ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy); + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx); void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, @@ -168,9 +191,16 @@ void MultiplexGradInferMeta(const MetaTensor& ids, const MetaTensor& out_grad, std::vector ins_grad); +void NanmedianGradInferMeta(const MetaTensor& x, + const MetaTensor& median_index, + const MetaTensor& out_grad, + const IntArray& axes, + bool keep_dim, + MetaTensor* x_grad); + void NllLossGradInferMeta(const MetaTensor& input, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, const MetaTensor& total_weight, const MetaTensor& out_grad, int64_t ignore_index, @@ -185,7 +215,7 @@ void PixelUnshuffleGradInferMeta(const MetaTensor& out_grad, void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, const MetaTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 837a43905e723..a8d5ad564fe9b 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -201,7 +201,7 @@ void BCELossInferMeta(const MetaTensor& input, } void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, + const MetaTensor& weights, int minlength, MetaTensor* out) { auto input_dim = x.dims(); @@ -220,8 +220,10 @@ void BincountInferMeta(const MetaTensor& x, "But the dimension of Input(X) is [%d]", input_dim.size())); - if (weights.is_initialized()) { - auto weights_dim = weights->dims(); + VLOG(1) << "####### CHECK weights"; + if (weights) { + auto weights_dim = weights.dims(); + VLOG(1) << "##### weights_dim " << weights_dim; PADDLE_ENFORCE_EQ(weights_dim.size(), 1, phi::errors::InvalidArgument( @@ -241,8 +243,8 @@ void BincountInferMeta(const MetaTensor& x, input_dim)); } out->set_dims(phi::make_ddim({-1})); - if (weights.is_initialized()) { - out->set_dtype(weights->dtype()); + if (weights) { + out->set_dtype(weights.dtype()); } else { out->set_dtype(x.dtype()); } @@ -864,7 +866,7 @@ void DistInferMeta(const MetaTensor& x, } void DropoutInferMeta(const MetaTensor& x, - paddle::optional seed_tensor, + const MetaTensor& seed_tensor, float p, bool is_test, const std::string& mode, @@ -981,8 +983,34 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void EmbeddingInferMeta(const MetaTensor& x, + const 
MetaTensor& weight, + int64_t padding_idx, + bool sparse, + MetaTensor* out) { + const auto& table_dims = weight.dims(); + const auto& ids_dims = x.dims(); + int ids_rank = ids_dims.size(); + VLOG(5) << "ids rank is " << ids_rank << std::endl; + PADDLE_ENFORCE_EQ( + table_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: The dimensions of the 'lookup table' must be 2. " + "But received lookup table's dimensions = %d, " + "lookup table's shape = [%s].", + table_dims.size(), + table_dims)); + + auto output_dims = phi::vectorize(ids_dims); + output_dims.push_back(table_dims[1]); + out->set_dims(phi::make_ddim(output_dims)); + out->set_dtype(weight.dtype()); + out->share_lod(x); +} + void ExpandAsInferMeta(const MetaTensor& x, - paddle::optional y, + const MetaTensor& y, const std::vector& target_shape, MetaTensor* out) { #define MAX_RANK_SUPPORTED 6 diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 192fa214c905f..2cd34406fc2d2 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -56,7 +56,7 @@ void BCELossInferMeta(const MetaTensor& input, MetaConfig config = MetaConfig()); void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, + const MetaTensor& weights, int minlength, MetaTensor* out); @@ -136,7 +136,7 @@ void DistInferMeta(const MetaTensor& x, void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void DropoutInferMeta(const MetaTensor& x, - paddle::optional seed_tensor, + const MetaTensor& seed_tensor, float p, bool is_test, const std::string& mode, @@ -154,8 +154,14 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void EmbeddingInferMeta(const MetaTensor& x, + const MetaTensor& weight, + int64_t padding_idx, + bool sparse, + MetaTensor* out); + void ExpandAsInferMeta(const MetaTensor& x, - paddle::optional y, + const MetaTensor& y, const std::vector& target_shape, MetaTensor* out); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 48c40673ab819..63f0d0c1eeb28 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -100,8 +100,8 @@ void AdamInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -238,8 +238,8 @@ void AdamwInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -580,7 +580,7 @@ void BatchNormInferInferMeta(const MetaTensor& x, void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, - paddle::optional bias, + const MetaTensor& bias, MetaTensor* out, MetaConfig config) { auto x_dims = x.dims(); @@ -619,8 +619,8 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, "The second dimension of input(Y) must be equal to " "the third dimension of the input(Weight).")); - if (bias.get_ptr()) { - auto bias_dims = bias->dims(); + if (bias) { + auto bias_dims = bias.dims(); PADDLE_ENFORCE_EQ(bias_dims.size(), 2UL, errors::InvalidArgument( @@ -772,7 +772,7 @@ inline int ConvOutputSize( void 
DeformableConvInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, @@ -918,7 +918,7 @@ void DeformableConvInferMeta(const MetaTensor& x, deformable_groups)); if (mask) { - auto mask_dims = mask->dims(); + auto mask_dims = mask.dims(); PADDLE_ENFORCE_EQ(output_shape[2], mask_dims[2], phi::errors::InvalidArgument( @@ -958,9 +958,9 @@ void DeformableConvInferMeta(const MetaTensor& x, void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const MetaTensor& path, + const MetaTensor& code, + const MetaTensor& bias, int num_classes, bool remote_prefetch, int trainer_id, @@ -991,9 +991,9 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, static void Interpolate1DInferShapeCheck( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1048,7 +1048,7 @@ static void Interpolate1DInferShapeCheck( int out_w_tmp; if (scale_tensor) { - auto scale_tensor_dim = scale_tensor->dims(); + auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( scale_tensor_dim.size(), 1, @@ -1086,7 +1086,7 @@ static void Interpolate1DInferShapeCheck( } if (out_size && config.is_runtime) { - auto out_size_dim = out_size->dims(); + auto out_size_dim = out_size.dims(); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, @@ -1118,9 +1118,9 @@ static void Interpolate1DInferShapeCheck( static void Interpolate2DInferShapeCheck( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1178,7 +1178,7 @@ static void Interpolate2DInferShapeCheck( int out_h_tmp, out_w_tmp; if (scale_tensor) { - auto scale_tensor_dim = scale_tensor->dims(); + auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( scale_tensor_dim.size(), 1, @@ -1231,7 +1231,7 @@ static void Interpolate2DInferShapeCheck( } if (out_size && config.is_runtime) { - auto out_size_dim = out_size->dims(); + auto out_size_dim = out_size.dims(); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, @@ -1263,9 +1263,9 @@ static void Interpolate2DInferShapeCheck( static void Interpolate3DInferShapeCheck( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1321,7 +1321,7 @@ static void Interpolate3DInferShapeCheck( int out_d_tmp, out_h_tmp, out_w_tmp; if (scale_tensor) { - auto scale_tensor_dim = scale_tensor->dims(); + auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( scale_tensor_dim.size(), 1, @@ -1389,7 +1389,7 @@ static void Interpolate3DInferShapeCheck( } if (out_size && config.is_runtime) { - auto out_size_dim = out_size->dims(); + auto out_size_dim = out_size.dims(); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, @@ -1419,9 +1419,9 @@ static void Interpolate3DInferShapeCheck( void InterpolateInferMeta( const 
MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1546,7 +1546,7 @@ void MomentumInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& velocity, const MetaTensor& learning_rate, - paddle::optional master_param, + const MetaTensor& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -1709,7 +1709,7 @@ void MultiplexInferMeta(const std::vector& ins, void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, int pooled_height, int pooled_width, int output_channels, @@ -1732,8 +1732,8 @@ void PsroiPoolInferMeta(const MetaTensor& x, errors::InvalidArgument( "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]")); - if (rois_num.get_ptr()) { - auto rois_num_dims = rois_num->dims(); + if (rois_num) { + auto rois_num_dims = rois_num.dims(); PADDLE_ENFORCE_EQ( rois_num_dims.size(), 1, @@ -1787,7 +1787,7 @@ void RmspropInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, const MetaTensor& learning_rate, - paddle::optional mean_grad, + const MetaTensor& mean_grad, float epsilon, float decay, float momentum, @@ -1837,14 +1837,14 @@ void RmspropInferMeta(const MetaTensor& param, mean_square_out->set_dtype(mean_square.dtype()); if (centered) { mean_grad_out->set_dims(param_dim); - mean_grad_out->set_dtype(mean_grad.get_ptr()->dtype()); + mean_grad_out->set_dtype(mean_grad.dtype()); } } void RnnInferMeta(const MetaTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const MetaTensor& sequence_length, float dropout_prob, bool is_bidirec, int input_size, @@ -1867,7 +1867,7 @@ void RnnInferMeta(const MetaTensor& x, in_dims.size())); if (sequence_length) { - auto seq_dims = sequence_length->dims(); + auto seq_dims = sequence_length.dims(); PADDLE_ENFORCE_EQ( in_dims[1], seq_dims[0], @@ -1929,7 +1929,7 @@ void RnnInferMeta(const MetaTensor& x, void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, - paddle::optional master_param, + const MetaTensor& master_param, bool multi_precision, MetaTensor* param_out, MetaTensor* master_param_out) { @@ -2006,8 +2006,8 @@ void UnchangedMultiInferMeta(const std::vector& x, void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, - const paddle::optional logits_length, - const paddle::optional labels_length, + const MetaTensor& logits_length, + const MetaTensor& labels_length, int blank, bool norm_by_times, MetaTensor* warpctc_grad, @@ -2015,7 +2015,7 @@ void WarpctcInferMeta(const MetaTensor& logits, auto logits_dims = logits.dims(); int sequence_width = 0; - if (logits_length.is_initialized()) { + if (logits_length) { sequence_width = logits_dims[2]; } else { sequence_width = @@ -2069,8 +2069,8 @@ void WhereInferMeta(const MetaTensor& condition, void GraphReindexInferMeta(const MetaTensor& x, const MetaTensor& neighbors, const MetaTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, bool flag_buffer_hashtable, MetaTensor* reindex_src, MetaTensor* reindex_dst, @@ -2100,8 +2100,8 @@ void GraphReindexInferMeta(const MetaTensor& x, 
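// The InferMeta hunks in this patch replace optional MetaTensor parameters with plain
// const MetaTensor& arguments that may wrap a null TensorBase and are branch-tested
// before dims()/dtype() are read (see the BincountInferMeta and GraphReindexInferMeta
// changes). A hedged sketch of that pattern; ScaledSumInferMeta is a made-up example,
// not an operator touched by this patch.
#include "paddle/phi/core/meta_tensor.h"

namespace phi {

void ScaledSumInferMeta(const MetaTensor& x,
                        const MetaTensor& scale,  // optional: may be empty
                        MetaTensor* out) {
  out->set_dims(x.dims());
  // The safe-bool conversion added to MetaTensor makes the presence test a plain
  // `if (scale)`, replacing get_ptr() != nullptr / is_initialized() on an optional.
  if (scale) {
    out->set_dtype(scale.dtype());
  } else {
    out->set_dtype(x.dtype());
  }
  out->share_lod(x);
}

}  // namespace phi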
GraphReindexShapeCheck(neighbors.dims(), "Neighbors"); GraphReindexShapeCheck(count.dims(), "Count"); if (flag_buffer_hashtable) { - GraphReindexShapeCheck(hashtable_value->dims(), "HashTable_Value"); - GraphReindexShapeCheck(hashtable_index->dims(), "HashTable_Index"); + GraphReindexShapeCheck(hashtable_value.dims(), "HashTable_Value"); + GraphReindexShapeCheck(hashtable_index.dims(), "HashTable_Index"); } reindex_src->set_dims({-1}); @@ -2112,18 +2112,17 @@ void GraphReindexInferMeta(const MetaTensor& x, out_nodes->set_dtype(x.dtype()); } -void GraphSampleNeighborsInferMeta( - const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids) { +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids) { // GSN: GraphSampleNeighbors auto GSNShapeCheck = [](const phi::DDim& dims, std::string tensor_name) { if (dims.size() == 2) { @@ -2149,12 +2148,12 @@ void GraphSampleNeighborsInferMeta( GSNShapeCheck(col_ptr.dims(), "Col_Ptr"); GSNShapeCheck(x.dims(), "X"); if (return_eids) { - GSNShapeCheck(eids->dims(), "Eids"); + GSNShapeCheck(eids.dims(), "Eids"); out_eids->set_dims({-1}); out_eids->set_dtype(row.dtype()); } if (flag_perm_buffer) { - GSNShapeCheck(perm_buffer->dims(), "Perm_Buffer"); + GSNShapeCheck(perm_buffer.dims(), "Perm_Buffer"); } out->set_dims({-1}); @@ -2166,7 +2165,7 @@ void GraphSampleNeighborsInferMeta( void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, - const paddle::optional gt_score, + const MetaTensor& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, @@ -2271,8 +2270,8 @@ void Yolov3LossInferMeta(const MetaTensor& x, "But received class_num(%s) < 0", class_num)); - if (gt_score.get_ptr()) { - auto dim_gtscore = gt_score->dims(); + if (gt_score) { + auto dim_gtscore = gt_score.dims(); PADDLE_ENFORCE_EQ( dim_gtscore.size(), 2, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 65b5819b602ba..54c6fccceb9c1 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -76,8 +76,8 @@ void AdamInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -99,8 +99,8 @@ void AdamwInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -170,7 +170,7 @@ void BatchNormInferInferMeta(const MetaTensor& x, void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, - paddle::optional bias, + const MetaTensor& bias, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -185,7 +185,7 @@ void ConcatInferMeta(const std::vector& x, void 
DeformableConvInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, @@ -198,9 +198,9 @@ void DeformableConvInferMeta(const MetaTensor& x, void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const MetaTensor& path, + const MetaTensor& code, + const MetaTensor& bias, int num_classes, bool remote_prefetch, int trainer_id, @@ -214,9 +214,9 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, void InterpolateInferMeta( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -241,7 +241,7 @@ void MomentumInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& velocity, const MetaTensor& learning_rate, - paddle::optional master_param, + const MetaTensor& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -261,7 +261,7 @@ void MultiplexInferMeta(const std::vector& ins, void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, int pooled_height, int pooled_width, int output_channels, @@ -273,7 +273,7 @@ void RmspropInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, const MetaTensor& learning_rate, - paddle::optional mean_grad, + const MetaTensor& mean_grad, float epsilon, float decay, float momentum, @@ -286,7 +286,7 @@ void RmspropInferMeta(const MetaTensor& param, void RnnInferMeta(const MetaTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const MetaTensor& sequence_length, float dropout_prob, bool is_bidirec, int input_size, @@ -303,7 +303,7 @@ void RnnInferMeta(const MetaTensor& x, void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, - paddle::optional master_param, + const MetaTensor& master_param, bool multi_precision, MetaTensor* param_out, MetaTensor* master_param_out); @@ -317,8 +317,8 @@ void UnchangedMultiInferMeta(const std::vector& x, void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, - const paddle::optional logits_length, - const paddle::optional labels_length, + const MetaTensor& logits_length, + const MetaTensor& labels_length, int blank, bool norm_by_times, MetaTensor* warpctc_grad, @@ -332,30 +332,29 @@ void WhereInferMeta(const MetaTensor& condition, void GraphReindexInferMeta(const MetaTensor& x, const MetaTensor& neighbors, const MetaTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, bool flag_buffer_hashtable, MetaTensor* reindex_src, MetaTensor* reindex_dst, MetaTensor* out_nodes); -void GraphSampleNeighborsInferMeta( - const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids); +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + 
const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids); void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, - const paddle::optional gt_score, + const MetaTensor& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index ae8c7dd61c3bb..3c2888cee58c7 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -113,23 +113,23 @@ void AddmmInferMeta(const MetaTensor& input, "if you put exe.run(startup_program) " "after optimizer.minimize function.")); // dim check - PADDLE_ENFORCE_EQ( - ndim_input, - 2, - errors::InvalidArgument("The input tensor input's dimension must be 2. " - "But received input's dimension = [%s].", - ndim_input)); + PADDLE_ENFORCE_EQ(ndim_input == 2 || ndim_input == 1, + true, + errors::InvalidArgument( + "The input tensor input's dimension must be 2 or 1. " + "But received input's dimension = [%d].", + ndim_input)); PADDLE_ENFORCE_EQ( ndim_x, 2, errors::InvalidArgument("The input tensor x's dimension must be 2. " - "But received x's dimension = [%s].", + "But received x's dimension = [%d].", ndim_x)); PADDLE_ENFORCE_EQ( ndim_y, 2, errors::InvalidArgument("The input tensor y's dimension must be 2. " - "But received y's dimension = [%s].", + "But received y's dimension = [%d].", ndim_y)); std::vector output_dims; @@ -191,6 +191,109 @@ void ArangeInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void InstanceNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config) { + PADDLE_ENFORCE_NE(y, + nullptr, + phi::errors::InvalidArgument( + "The y in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_mean, + nullptr, + phi::errors::InvalidArgument( + "The saved_mean in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_variance, + nullptr, + phi::errors::InvalidArgument( + "The saved_variance in InstanceNormInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + PADDLE_ENFORCE_NE(phi::product(x_dims), + 0, + phi::errors::PreconditionNotMet( + "The Input variable X has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "greater than or equal to 2. But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "smaller than or equal to 5, But received: the shape of input " + "X = [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + auto N = x_dims[0]; + auto C = x_dims[1]; + auto NxC = N * C; + if (scale) { + auto scale_dim = scale.dims(); + PADDLE_ENFORCE_EQ( + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." 
+ "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(scale_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + } + } + if (bias) { + auto bias_dim = bias.dims(); + PADDLE_ENFORCE_EQ( + bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." + "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(bias_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + } + y->set_dims(x_dims); + saved_mean->set_dims({NxC}); + saved_variance->set_dims({NxC}); + y->share_lod(x); + y->set_dtype(x.dtype()); + y->set_layout(x.layout()); +} + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, @@ -260,8 +363,8 @@ void GraphSendRecvInferMeta(const MetaTensor& x, } void LayerNormInferMeta(const MetaTensor& x, - paddle::optional scale, - paddle::optional bias, + const MetaTensor& scale, + const MetaTensor& bias, float epsilon, int begin_norm_axis, bool is_test, @@ -283,19 +386,19 @@ void LayerNormInferMeta(const MetaTensor& x, auto matrix_dim = phi::flatten_to_2d(x_dim, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); - if (scale.get_ptr() != nullptr) { - PADDLE_ENFORCE_EQ(scale->dims().size(), + if (scale) { + PADDLE_ENFORCE_EQ(scale.dims().size(), 1, phi::errors::InvalidArgument( "The dimensions of Input(Scale) must be 1, but " "received dimensions of" "Input(Scale) is [%d]", - scale->dims().size())); + scale.dims().size())); } - if (config.is_runtime && scale.get_ptr() != nullptr) { + if (config.is_runtime && scale) { PADDLE_ENFORCE_EQ( - scale->dims()[0], + scale.dims()[0], right, phi::errors::InvalidArgument( "The first dimension value of Input(Scale) must equal to be the" @@ -303,21 +406,21 @@ void LayerNormInferMeta(const MetaTensor& x, "But received the first dimension value of Input(Scale) is" "[%d], the second dimension value of the flattened 2D matrix of" " Input(Scale) is [%d].", - scale->dims()[0], + scale.dims()[0], right)); } - if (bias.get_ptr() != nullptr) { - PADDLE_ENFORCE_EQ(bias->dims().size(), + if (bias) { + PADDLE_ENFORCE_EQ(bias.dims().size(), 1, phi::errors::InvalidArgument( "The dimensions of Input(Bias) must be 1, but " "received dimensions of" "Input(Bias) is [%d]", - bias->dims().size())); + bias.dims().size())); } - if (config.is_runtime && bias.get_ptr() != nullptr) { + if (config.is_runtime && bias) { PADDLE_ENFORCE_EQ( - bias->dims()[0], + bias.dims()[0], right, phi::errors::InvalidArgument( "The first dimension value of Input(Bias) must equal to be the" @@ -325,7 +428,7 @@ void LayerNormInferMeta(const MetaTensor& x, "But received the first dimension value of Input(Bias) is" "[%d], the second dimension value of the flattened 2D matrix of" " Input(Bias) is [%d].", - bias->dims()[0], + bias.dims()[0], right)); } @@ -340,19 +443,19 @@ void LayerNormInferMeta(const MetaTensor& x, } void LayerNormGradInferMeta(const MetaTensor& x, - paddle::optional y, - 
paddle::optional z, + const MetaTensor& y, + const MetaTensor& z, MetaTensor* dx, MetaTensor* dy, MetaTensor* dz) { if (dx) { dx->share_meta(x); } - if (dy && (y.get_ptr() != nullptr)) { - dy->share_meta(*y.get_ptr()); + if (dy && y) { + dy->share_meta(y); } - if (dz && (z.get_ptr() != nullptr)) { - dz->share_meta(*z.get_ptr()); + if (dz && z) { + dz->share_meta(z); } } @@ -412,7 +515,7 @@ void LinspaceInferMeta(const MetaTensor& start, void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, int64_t ignore_index, const std::string& reduction, MetaTensor* out, @@ -437,8 +540,8 @@ void NllLossRawInferMeta(const MetaTensor& input, " batch_size is [%s].", x_dims[0], label_dims[0])); - if (weight.get_ptr() != nullptr) { - auto w_dims = weight->dims(); + if (weight) { + auto w_dims = weight.dims(); PADDLE_ENFORCE_EQ( w_dims.size(), 1, @@ -502,7 +605,7 @@ void PutAlongAxisInferMeta(const MetaTensor& x, void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, + const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, @@ -514,7 +617,7 @@ void RoiAlignInferMeta(const MetaTensor& x, auto boxes_dims = boxes.dims(); if (boxes_num) { - auto boxes_num_dims = boxes_num->dims(); + auto boxes_num_dims = boxes_num.dims(); PADDLE_ENFORCE_EQ( boxes_num_dims.size(), 1, @@ -579,7 +682,7 @@ void RoiAlignInferMeta(const MetaTensor& x, void RoiPoolInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, + const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, @@ -589,7 +692,7 @@ void RoiPoolInferMeta(const MetaTensor& x, auto boxes_dims = boxes.dims(); if (boxes_num) { - auto boxes_num_dims = boxes_num->dims(); + auto boxes_num_dims = boxes_num.dims(); PADDLE_ENFORCE_EQ( boxes_num_dims.size(), 1, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 4f561e0adf19d..760011ad829fc 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -52,6 +52,15 @@ void ArangeInferMeta(const MetaTensor& start, const MetaTensor& step, MetaTensor* out); +void InstanceNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config = MetaConfig()); + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, @@ -61,8 +70,8 @@ void GraphSendRecvInferMeta(const MetaTensor& x, MetaTensor* dst_count); void LayerNormInferMeta(const MetaTensor& x, - paddle::optional scale, - paddle::optional bias, + const MetaTensor& scale, + const MetaTensor& bias, float epsilon, int begin_norm_axis, bool is_test, @@ -72,8 +81,8 @@ void LayerNormInferMeta(const MetaTensor& x, MetaConfig config = MetaConfig()); void LayerNormGradInferMeta(const MetaTensor& x, - paddle::optional y, - paddle::optional z, + const MetaTensor& y, + const MetaTensor& z, MetaTensor* dx, MetaTensor* dy, MetaTensor* dz); @@ -96,7 +105,7 @@ void LinspaceInferMeta(const MetaTensor& start, void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, int64_t ignore_index, const std::string& reduction, MetaTensor* out, @@ -112,7 +121,7 @@ void PutAlongAxisInferMeta(const MetaTensor& x, void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, 
+ const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, @@ -123,7 +132,7 @@ void RoiAlignInferMeta(const MetaTensor& x, void RoiPoolInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, + const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index c88c2d6f60f10..0beb7223f212a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -401,7 +401,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out) { + MetaTensor* out, + std::vector inner_cache) { // collect the following informations to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); @@ -1245,6 +1246,65 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } +void NanmedianInferMeta(const MetaTensor& x, + const IntArray& axes, + bool keep_dim, + MetaTensor* out, + MetaTensor* median_index) { + std::vector axis_list = axes.GetData(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + out->set_dtype(x.dtype()); + median_index->set_dtype(DataType::INT64); + median_index->set_dims(make_ddim({x.numel() * 2})); + + std::vector out_dim; + if (axis_list.empty()) { + if (keep_dim) { + for (int64_t i = 0; i < x_rank; i++) { + out_dim.push_back(1); + } + } else { + out_dim.push_back(1); + } + } else { + std::vector cleaned_axis; + for (auto& axis : axis_list) { + if (axis < 0) axis += x_rank; + + PADDLE_ENFORCE_LT( + axis, + x_rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], R is " + "the rank of Input(X). But received axis: %d, R: %d. " + "Current Input(X)'s shape is=[%s].", + axis, + x_rank, + x_dim)); + + PADDLE_ENFORCE_EQ( + std::find(cleaned_axis.begin(), cleaned_axis.end(), axis), + cleaned_axis.end(), + errors::InvalidArgument("Attr(axes) has duplicated elements: %d.", + static_cast(axis))); + + cleaned_axis.push_back(axis); + } + + for (int64_t i = 0; i < x_rank; i++) { + if (std::find(cleaned_axis.begin(), cleaned_axis.end(), i) == + cleaned_axis.end()) { + out_dim.push_back(x_dim[i]); + } else if (keep_dim) { + out_dim.push_back(1); + } + } + } + + out->set_dims(make_ddim(out_dim)); +} + void NormInferMeta(const MetaTensor& x, int axis, float epsilon, @@ -1917,6 +1977,55 @@ void RollInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RReluInferMeta(const MetaTensor& x, + float lower, + float upper, + bool is_test, + MetaTensor* out, + MetaTensor* noise) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE(lower, + 0, + phi::errors::InvalidArgument( + "The lower value should be greater than or equal to 0. " + "But received lower value = %f.", + lower)); + PADDLE_ENFORCE_LE(upper, + 1, + phi::errors::InvalidArgument( + "The upper value should be less than or equal to 1. 
" + "But received upper value = %f.", + upper)); + PADDLE_ENFORCE_GE( + upper, + lower, + phi::errors::InvalidArgument( + "The upper value should be greater than or equal to lower value " + "But received upper value = %f, lower value = %f.", + upper, + lower)); + + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); + + if (noise != nullptr) { + noise->set_dims(x_dims); + noise->set_dtype(x.dtype()); + noise->set_layout(x.layout()); + } +} + +void RReluGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& noise, + MetaTensor* x_grad) { + auto do_dims = out_grad.dims(); + x_grad->set_dims(do_dims); + x_grad->set_dtype(out_grad.dtype()); + x_grad->share_lod(out_grad); +} + void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { auto in_dims = x.dims(); PADDLE_ENFORCE_LT( diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 58b256dc66ee2..a288b9371016f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -82,7 +82,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out); + MetaTensor* out, + std::vector inner_cache); void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, @@ -177,6 +178,13 @@ void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); + +void NanmedianInferMeta(const MetaTensor& x, + const IntArray& axes, + bool keep_dim, + MetaTensor* out, + MetaTensor* median_index); + void NormInferMeta(const MetaTensor& x, int axis, float epsilon, @@ -273,6 +281,17 @@ void RollInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void RReluInferMeta(const MetaTensor& x, + float lower, + float upper, + bool is_test, + MetaTensor* out, + MetaTensor* noise); + +void RReluGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& noise, + MetaTensor* x_grad); + void SetValueInferMeta(const MetaTensor& x, MetaTensor* out); void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 5d7af6cca947a..8e63a0fd22ade 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -137,7 +137,7 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& d_dout_new, - paddle::optional d_ddout, + const paddle::optional& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx); diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index a14f732b6c8b6..b719ceddc5563 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -78,7 +78,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta) -DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(celu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b) diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h index f144d40d2b666..0bdf05f8e5123 100644 --- a/paddle/phi/kernels/adam_kernel.h +++ b/paddle/phi/kernels/adam_kernel.h @@ -28,8 +28,8 @@ void 
AdamDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/adamw_kernel.h b/paddle/phi/kernels/adamw_kernel.h index d7b072adda4a2..5cbb38143ff6f 100644 --- a/paddle/phi/kernels/adamw_kernel.h +++ b/paddle/phi/kernels/adamw_kernel.h @@ -28,8 +28,8 @@ void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 720ebb5b78c9a..2349bf990acd3 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -26,12 +26,12 @@ template void AssignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - Copy(dev_ctx, x, x.place(), false, out); + paddle::framework::TensorCopy(x, x.place(), out); } template void AssignRawKernel(const Context& dev_ctx, - paddle::optional x, + const paddle::optional& x, DenseTensor* out) { if (x) { if (!x->IsInitialized()) { diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index 6881ac9f0ee22..0294dc950deb1 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -31,7 +31,7 @@ void AssignKernel(const Context& dev_ctx, // this looks weird template void AssignRawKernel(const Context& dev_ctx, - paddle::optional x, + const paddle::optional& x, DenseTensor* out); template diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index 2cb3b16a022b1..3de2f69f452db 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -24,11 +24,11 @@ void BatchNormGradRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -47,11 +47,11 @@ void BatchNormGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -68,8 +68,8 @@ template void BatchNormDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, const DenseTensor& y_grad, diff --git a/paddle/phi/kernels/bilinear_tensor_product_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_kernel.h index 
b34e8946ddd58..bd01ed94868e2 100644 --- a/paddle/phi/kernels/bilinear_tensor_product_kernel.h +++ b/paddle/phi/kernels/bilinear_tensor_product_kernel.h @@ -24,7 +24,7 @@ void BilinearTensorProductKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& weight, - paddle::optional bias, + const paddle::optional& bias, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/bincount_kernel.h b/paddle/phi/kernels/bincount_kernel.h index 3ba69d365480f..e110b6e014b4d 100644 --- a/paddle/phi/kernels/bincount_kernel.h +++ b/paddle/phi/kernels/bincount_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void BincountKernel(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out); diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h index 0a359d778a681..f25cbe384c213 100644 --- a/paddle/phi/kernels/conv_grad_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -23,8 +23,8 @@ void ConvGradGradKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& filter, const DenseTensor& out_grad, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, @@ -40,11 +40,11 @@ void ConvGradGradKernel(const Context& dev_ctx, template void Conv3DGradGradKernel(const Context& dev_ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, @@ -54,8 +54,8 @@ void Conv3DGradGradKernel(const Context& dev_ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad); + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); } // namespace phi diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index 1e0f5c4df9fd6..339d690310f45 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -36,8 +36,8 @@ void AdamDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc index f2c98fded4d4f..93092133291af 100644 --- a/paddle/phi/kernels/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/cpu/adamw_kernel.cc @@ -35,8 +35,8 @@ void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc index 
80dea561956cf..f95ddc5621e9a 100644 --- a/paddle/phi/kernels/cpu/allclose_kernel.cc +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/allclose_kernel.h" #include - +#include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index bf01c24f4ffa3..366a08e59fee3 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -41,11 +41,11 @@ void BatchNormGradRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -300,11 +300,11 @@ void BatchNormGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -343,8 +343,8 @@ template void BatchNormDoubleGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& scale, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, const DenseTensor& y_grad, diff --git a/paddle/phi/kernels/cpu/bincount_kernel.cc b/paddle/phi/kernels/cpu/bincount_kernel.cc index c9dc44c1e04eb..8163953c1e00e 100644 --- a/paddle/phi/kernels/cpu/bincount_kernel.cc +++ b/paddle/phi/kernels/cpu/bincount_kernel.cc @@ -23,7 +23,7 @@ namespace phi { template void BincountInner(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { const DenseTensor* input = &x; @@ -85,7 +85,7 @@ void BincountInner(const Context& dev_ctx, template void BincountKernel(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { if (x.dtype() == DataType::INT32) { diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc index 4966c998dd37d..4538ccf9433f9 100644 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -21,11 +21,11 @@ namespace phi { template void Conv3DGradGradKernel(const Context& ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -35,9 +35,9 @@ void Conv3DGradGradKernel(const Context& ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search_t, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + 
DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { ConvGradGradKernel(ctx, input, filter, diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc index c00aedef8c67d..fa12e505e4209 100644 --- a/paddle/phi/kernels/cpu/dropout_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_kernel.cc @@ -23,7 +23,7 @@ namespace phi { template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional seed_tensor, + const paddle::optional& seed_tensor, float p, bool is_test, const std::string& mode, diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 3e25a65526d89..8968542b3e0b8 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc @@ -17,4 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, CPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} +PD_REGISTER_KERNEL( + einsum, CPU, ALL_LAYOUT, phi::EinsumKernelRaw, float, double) {} diff --git a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc index f8a89b997b413..434866b840cc3 100644 --- a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc @@ -63,8 +63,8 @@ template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc index 6070264547249..5019b9f570628 100644 --- a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -34,6 +34,14 @@ void AddKernel(const Context& dev_ctx, AddRawKernel(dev_ctx, x, y, axis, out); } +template +void GradAddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + AddKernel(dev_ctx, x, y, out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -65,3 +73,15 @@ PD_REGISTER_KERNEL(add, int64_t, complex64, complex128) {} + +PD_REGISTER_KERNEL(grad_add, + CPU, + ALL_LAYOUT, + phi::GradAddKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc index b86ead04dbc5f..03bb47aaa97b3 100644 --- a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -39,8 +39,8 @@ template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc index d6454b4796430..c0a88f3222717 100644 --- a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc @@ -27,8 +27,8 @@ void GraphReindexKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& neighbors, const DenseTensor& count, - 
paddle::optional hashtable_value, - paddle::optional hashtable_index, + const paddle::optional& hashtable_value, + const paddle::optional& hashtable_index, bool flag_buffer_hashtable, DenseTensor* reindex_src, DenseTensor* reindex_dst, diff --git a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc index b4321a85ab2ee..70aac053417b8 100644 --- a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc @@ -167,8 +167,8 @@ void GraphSampleNeighborsKernel( const DenseTensor& row, const DenseTensor& col_ptr, const DenseTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, + const paddle::optional& eids, + const paddle::optional& perm_buffer, int sample_size, bool return_eids, bool flag_perm_buffer, diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc index 95eeb64afea20..6ea65d005c1ad 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -121,8 +121,8 @@ void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - paddle::optional out, - paddle::optional dst_count, + const paddle::optional& out, + const paddle::optional& dst_count, const DenseTensor& out_grad, const std::string& pool_type, DenseTensor* x_grad) { diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h index cc67f8e7f210c..9b38095f25f75 100644 --- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h @@ -31,9 +31,9 @@ void HierarchicalSigmoidGradKernelImpl( const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc index 9edc9f87d4b1f..eee4525293f3f 100644 --- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc @@ -25,9 +25,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc index 4c4f1aa125a33..7c3421e88d449 100644 --- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc @@ -32,9 +32,9 @@ void HierarchicalSigmoidKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, int num_classes, bool remote_prefetch, int trainer_id, diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc 
b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc new file mode 100644 index 0000000000000..340d2907a7909 --- /dev/null +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +#include +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" + +namespace phi { + +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; +template +using EigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; + +template +void InstanceNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& d_y, + const paddle::optional& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + float epsilon, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { + const auto* scale_ptr = scale.get_ptr(); + + const auto& x_dims = x.dims(); + + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + + dev_ctx.template Alloc(d_x); + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes rdims(0); + Eigen::DSizes mean_rdims(1); + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); +#else + Eigen::IndexList> rdims; + Eigen::IndexList> mean_rdims; + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); +#endif + + phi::funcs::SetConstant set_constant; + + DenseTensor scale_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + auto scale_e = + scale_ptr + ? 
EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto mean_e = EigenVector::Flatten(saved_mean); + auto inv_var_e = EigenVector::Flatten(saved_variance); + auto dy_e = EigenVector::Flatten(d_y); + auto x_e = EigenVector::Flatten(x); + + auto scale_arr = scale_e.reshape(C_shape); + auto mean_arr = mean_e.reshape(NxC_shape); + auto inv_var_arr = inv_var_e.reshape(NxC_shape); + auto dy_arr = dy_e.reshape(shape); + auto x_arr = x_e.reshape(shape); + + auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * + inv_var_arr.eval().broadcast(bcast); + + // math: d_bias = np.sum(d_y, axis=(n,h,w)) + // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + set_constant(dev_ctx, d_scale, static_cast(0)); + set_constant(dev_ctx, d_bias, static_cast(0)); + + auto d_scale_e = EigenVector::Flatten(*d_scale); + auto d_scale_data = d_scale_e.reshape(C_shape); + auto d_bias_e = EigenVector::Flatten(*d_bias); + auto d_bias_data = d_bias_e.reshape(C_shape); + d_bias_data.device(*place) = + dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); + d_scale_data.device(*place) = + (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); + } + + auto dy_mean = + dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); + + Eigen::DSizes bcast_param(N, sample_size); + set_constant(dev_ctx, d_x, static_cast(0)); + // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, + // axis=(h,w)) + // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - + // mean), + // axis=(h,w)) + auto dx_e = EigenVector::Flatten(*d_x); + auto dx_arr = dx_e.reshape(shape); + dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * + inv_var_arr.broadcast(bcast) * + (dy_arr - dy_mean - + tmp * + (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .eval() + .broadcast(bcast)); +} + +template +void InstanceNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& dy, + const paddle::optional& ddx, + const paddle::optional& ddscale, + const paddle::optional& ddbias, + float epsilon, + DenseTensor* dx, + DenseTensor* dscale, + DenseTensor* ddy) { + const auto* Scale = scale.get_ptr(); + const auto* ddScale = ddscale.get_ptr(); + const auto* ddX = ddx.get_ptr(); + const auto* ddBias = ddbias.get_ptr(); + phi::funcs::SetConstant set_constant; + const auto& x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + const int sample_size = x.numel() / N / C; + const int NxC = N * C; + + const T* mean_data = saved_mean.data(); + const T* inv_var_data = saved_variance.data(); + DenseTensor mean_tensor; + DenseTensor inv_var_tensor; + ConstEigenArrayMap x_arr(x.data(), sample_size, NxC); + ConstEigenVectorArrayMap mean_arr(mean_data, NxC); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); + + DenseTensor mean_tile; + mean_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&mean_tile); + EigenArrayMap mean_tile_data(mean_tile.data(), sample_size, NxC); + DenseTensor inv_var_tile; + inv_var_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&inv_var_tile); + EigenArrayMap inv_var_tile_data(inv_var_tile.data(), sample_size, NxC); + + mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); + inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 
1); + + DenseTensor Scale_data; + if (!Scale) { + Scale_data.Resize({C}); + dev_ctx.template Alloc(&Scale_data); + set_constant(dev_ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? Scale->data() : Scale_data.data(), C); + + DenseTensor scale_tile; + scale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&scale_tile); + EigenArrayMap scale_tile_data(scale_tile.data(), sample_size, NxC); + scale_tile_data = scale_arr.transpose().replicate(sample_size, N); + ConstEigenArrayMap dy_arr(dy.data(), sample_size, NxC); + ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); + // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, + // axis=(h,w)) * np.sum(dy, axis=(h,w)) - + // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), + // axis=(h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * + // np.sum(dy, axis=(h,w)) * (x - mean) * + // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var * np.mean(dy, axis=(h,w)) - inv_var.pow(3) * + // (x - mean) * np.mean(dy * (x - mean), axis=(h,w))) + + DenseTensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&x_sub_mean_mul_invstd); + EigenArrayMap x_sub_mean_mul_invstd_arr( + x_sub_mean_mul_invstd.data(), sample_size, NxC); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dx) { + dev_ctx.template Alloc(dx); + set_constant(dev_ctx, dx, static_cast(0)); + EigenArrayMap dx_arr(dx->data(), sample_size, NxC); + if (ddX) { + dx_arr += + x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size * + (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - + (dy_arr * ddx_arr).colwise().sum() + + 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (dy_arr.colwise().sum() / sample_size - dy_arr); + dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (ddx_arr.colwise().sum() / sample_size - ddx_arr); + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + dx_arr += (dy_arr * inv_var_tile_data - + dy_arr.colwise().sum() / sample_size * inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size) * + ddscale_tile_data; + } + } + if (dscale) { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx + dev_ctx.template Alloc(dscale); + set_constant(dev_ctx, dscale, static_cast(0)); + EigenVectorArrayMap dscale_arr(dscale->data(), C); + if (ddX) { + DenseTensor first_grad; + first_grad.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&first_grad); + set_constant(dev_ctx, &first_grad, static_cast(0)); + EigenArrayMap first_grad_arr(first_grad.data(), sample_size, NxC); + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .colwise() + .sum() + .replicate(sample_size, 1) / + sample_size); + first_grad_arr = first_grad_arr * ddx_arr; + for (int nc = 0; nc < NxC; ++nc) { + int c = nc % C; + dscale_arr(c) += first_grad_arr.colwise().sum()(nc); + } + } + } + if (ddy) { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(h,w))) + dev_ctx.template Alloc(ddy); + set_constant(dev_ctx, ddy, static_cast(0)); + EigenArrayMap ddy_arr(ddy->data(), sample_size, NxC); + if (ddX) { + ddy_arr += scale_tile_data * inv_var_tile_data * + (ddx_arr - ddx_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + } + if (ddScale && ddBias) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + DenseTensor ddbias_tile; + ddbias_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddbias_tile); + EigenArrayMap ddbias_tile_data( + ddbias_tile.data(), sample_size, NxC); + ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + ddy_arr += ddbias_tile_data; + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL(instance_norm_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} 
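A compact restatement of the `// math:` comments above (an illustrative summary by the editor, not text from the patch): with \(\mu\) and \(\sigma^{-1}\) the saved mean and saved inverse standard deviation of each \((n, c)\) slice, \(\hat{x} = (x - \mu)\,\sigma^{-1}\), and the overline denoting the mean over that slice's \(H \cdot W\) positions, the first-order gradients computed by the Eigen expressions in InstanceNormGradKernel are

\[
d\beta_c = \sum_{n,h,w} dy, \qquad
d\gamma_c = \sum_{n,h,w} dy\,\hat{x}, \qquad
dx = \gamma\,\sigma^{-1}\bigl(dy - \overline{dy} - \hat{x}\,\overline{dy\,\hat{x}}\bigr).
\]

The double-gradient kernel registered next differentiates these expressions once more with respect to x, scale, and dy.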
+PD_REGISTER_KERNEL(instance_norm_double_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc new file mode 100644 index 0000000000000..5eac473effa0e --- /dev/null +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void InstanceNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + float epsilon_f, + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance) { + const auto& x_dims = x.dims(); + T epsilon = static_cast(epsilon_f); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes shape(NxC, sample_size); +// Once eigen on Windows is updated, the if branch can be removed. 
+#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); + Eigen::DSizes rdims(1); +#else + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); + Eigen::IndexList> rdims; +#endif + + phi::funcs::SetConstant set_constant; + dev_ctx.template Alloc(saved_mean); + dev_ctx.template Alloc(saved_variance); + set_constant(dev_ctx, saved_mean, static_cast(0)); + set_constant(dev_ctx, saved_variance, static_cast(0)); + + auto saved_mean_a = EigenVector::Flatten(*saved_mean); + auto saved_mean_e = saved_mean_a.reshape(NxC_shape); + auto saved_variance_a = EigenVector::Flatten(*saved_variance); + auto saved_variance_e = saved_variance_a.reshape(NxC_shape); + + auto x_e = EigenVector::Flatten(x); + auto x_arr = x_e.reshape(shape); + + saved_mean_e.device(*place) = x_arr.mean(rdims); + auto saved_variance_arr = + (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; + + saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_data; + DenseTensor bias_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + if (!bias_ptr) { + bias_data.Resize({C}); + dev_ctx.template Alloc(&bias_data); + set_constant(dev_ctx, &bias_data, static_cast(0)); + } + auto scale_e = + scale_ptr + ? EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto scale_arr = scale_e.reshape(C_shape); + auto bias_e = + bias_ptr + ? EigenVector::Flatten(*bias_ptr) + : EigenVector::Flatten(const_cast(bias_data)); + auto bias_arr = bias_e.reshape(C_shape); + + dev_ctx.template Alloc(y); + auto y_e = EigenVector::Flatten(*y); + auto y_arr = y_e.reshape(shape); + + // (x - mean) * inv_std * scale + bias + Eigen::DSizes bcast_param(N, sample_size); + y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * + saved_variance_e.broadcast(bcast) * + scale_arr.broadcast(bcast_param) + + bias_arr.broadcast(bcast_param); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + instance_norm, CPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index 550439a5251db..d4e13aa3b24fe 100644 --- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -361,9 +361,9 @@ template static void Interpolate1DCPUBwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_w, @@ -459,9 +459,9 @@ template static void Interpolate2DCPUBwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_h, @@ -619,9 +619,9 @@ template static void Interpolate3DCPUBwd( const Context& dev_ctx, const 
DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_d, @@ -800,9 +800,9 @@ template void InterpolateGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout, int out_d, @@ -867,9 +867,9 @@ template void BilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -901,9 +901,9 @@ template void NearestInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -935,9 +935,9 @@ template void TrilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -969,9 +969,9 @@ template void LinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1003,9 +1003,9 @@ template void BicubicInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc index da9a54748f06f..5259a770568e4 100644 --- a/paddle/phi/kernels/cpu/interpolate_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -504,9 +504,9 @@ template static void Interpolate1DCPUFwd( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_w, const std::vector& scale, @@ -603,9 +603,9 @@ template static void Interpolate2DCPUFwd( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& 
size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_h, int out_w, @@ -770,9 +770,9 @@ template static void Interpolate3DCPUFwd( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -966,9 +966,9 @@ template void InterpolateKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1029,9 +1029,9 @@ template void BilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1061,9 +1061,9 @@ template void NearestInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1093,9 +1093,9 @@ template void TrilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1125,9 +1125,9 @@ template void LinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1157,9 +1157,9 @@ template void BicubicInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc index c76fb826cdfcc..af9548e8186bc 100644 --- a/paddle/phi/kernels/cpu/label_smooth_kernel.cc +++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc @@ -22,7 +22,7 @@ namespace phi { template void LabelSmoothKernel(const Context& ctx, const DenseTensor& label, - paddle::optional prior_dist, + const paddle::optional& prior_dist, float epsilon, DenseTensor* out) { auto label_dim = label.dims()[label.dims().size() - 1]; diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index 7c1b33f047b61..a30f54fd4b60e 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -32,8 +32,8 @@ namespace phi { template void LayerNormGradKernel(const Context& dev_ctx, 
const DenseTensor& x, - paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, const DenseTensor& mean, const DenseTensor& variance, const DenseTensor& out_grad, diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index 5b09d68c7ca08..52722468e16bd 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -30,8 +30,8 @@ namespace phi { template void LayerNormKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, float epsilon, int begin_norm_axis, bool is_test, diff --git a/paddle/phi/kernels/cpu/mv_grad_kernel.cc b/paddle/phi/kernels/cpu/mv_grad_kernel.cc index c3b7f94be4194..c87801bb69389 100644 --- a/paddle/phi/kernels/cpu/mv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/mv_grad_kernel.cc @@ -31,7 +31,7 @@ void MvGradKernel(const Context& dev_ctx, auto dx = x_grad; auto dvec = vec_grad; - auto dim_x = x.dims(); + const auto& dim_x = x.dims(); int m = dim_x[0]; int n = dim_x[1]; diff --git a/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc new file mode 100644 index 0000000000000..156124c214895 --- /dev/null +++ b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
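As a rough standalone sketch of the gradient scatter performed by the new CPU nanmedian gradient kernel added just below (plain C++, no Paddle types; the helper name is illustrative only): each output row carries two recorded indices, the full gradient goes to a single middle element when the two indices coincide, half goes to each when they differ, and -1 marks an all-NaN row that receives no gradient.

#include <cstdint>
#include <vector>

// Illustrative helper (hypothetical, not Paddle API): scatter out_grad back to
// the positions recorded in median_index, two int64 entries per row.
void scatter_median_grad(const std::vector<double>& out_grad,
                         const std::vector<int64_t>& median_index,
                         int64_t stride,
                         std::vector<double>* x_grad /* zero-filled, rows*stride */) {
  const int64_t rows = static_cast<int64_t>(out_grad.size());
  for (int64_t i = 0; i < rows; ++i) {
    const int64_t l = median_index[2 * i];
    const int64_t r = median_index[2 * i + 1];
    if (l < 0) continue;  // all-NaN row: leave the gradient at zero
    if (l == r) {
      (*x_grad)[i * stride + l] = out_grad[i];  // odd count: one middle element
    } else {
      (*x_grad)[i * stride + l] = out_grad[i] / 2.0;  // even count: split the
      (*x_grad)[i * stride + r] = out_grad[i] / 2.0;  // gradient between the two
    }
  }
}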
+ +#include "paddle/phi/kernels/nanmedian_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void CalcMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad, + T* x_grad_ptr) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0)); + if (!x_grad_ptr) return; + + const int64_t* m_ptr = median_index.data(); + const T* out_grad_ptr = out_grad.data(); + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t rank = x_dim.size(); + int64_t stride = x_dim[rank - 1]; + + int64_t pre_dim = numel / stride; + int64_t i = 0; + int64_t offset = 0; + T div_factor = static_cast(2.0); + for (i = 0; i < pre_dim; i++) { + if (m_ptr[2 * i] >= 0) { + if (m_ptr[2 * i] == m_ptr[2 * i + 1]) { + x_grad_ptr[offset + m_ptr[2 * i]] = out_grad_ptr[i]; + } else { + x_grad_ptr[offset + m_ptr[2 * i]] = out_grad_ptr[i] / div_factor; + x_grad_ptr[offset + m_ptr[2 * i + 1]] = out_grad_ptr[i] / div_factor; + } + } + offset += stride; + } +} + +template +void BaseMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad) { + auto rank = x.dims().size(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (axes.size() && (rank > 1)) { + DenseTensor tmp_x_grad(*x_grad); + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, axes, &tmp_x_grad, x_grad_ptr); + PostprocessMedianGradKernel(dev_ctx, &tmp_x_grad, axes, x_grad); + } else { + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, axes, x_grad, x_grad_ptr); + } +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad) { + BaseMedianGradKernel( + dev_ctx, input, median_index, out_grad, axes, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian_grad, + CPU, + ALL_LAYOUT, + phi::NanmedianGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc new file mode 100644 index 0000000000000..ed38405c9179f --- /dev/null +++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
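Before the top-k based implementation that follows, the intended behaviour for a single row can be sketched in plain C++ (a standalone sketch, no Paddle types; the function name is illustrative): drop NaNs, sort what remains, return the middle value (odd count) or the average of the two middle values (even count) together with their original positions, and fall back to (-1, -1) indices when the whole row is NaN.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

// Returns {median value, {left index, right index}} for one non-empty row.
std::pair<double, std::pair<int64_t, int64_t>> nanmedian_row(
    const std::vector<double>& row) {
  std::vector<std::pair<double, int64_t>> valid;  // (value, original position)
  for (int64_t i = 0; i < static_cast<int64_t>(row.size()); ++i) {
    if (!std::isnan(row[i])) valid.emplace_back(row[i], i);
  }
  if (valid.empty()) return {row[0], {-1, -1}};  // all-NaN row
  std::sort(valid.begin(), valid.end());
  const int64_t n = static_cast<int64_t>(valid.size());
  if (n % 2 == 1) {  // odd count: single middle element
    const auto& m = valid[n / 2];
    return {m.first, {m.second, m.second}};
  }
  const auto& lo = valid[n / 2 - 1];  // even count: average the two
  const auto& hi = valid[n / 2];      // middle elements
  return {(lo.first + hi.first) / 2.0, {lo.second, hi.second}};
}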
+ +#include "paddle/phi/kernels/nanmedian_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/top_k_kernel.h" + +namespace phi { + +template +void CalcMedianFunc(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& nan_counts, + bool ignore_nan, + int64_t sort_k, + int64_t stride, + int64_t pre_dim, + T* o_ptr, + int64_t* m_ptr) { + bool should_ignore_nan = ignore_nan; + DenseTensor sort_out; + DenseTensor sort_indices; + auto sort_dim = x.dims(); + int64_t rank = sort_dim.size(); + sort_dim[rank - 1] = sort_k; + sort_out.Resize(sort_dim); + sort_indices.Resize(sort_dim); + + dev_ctx.template Alloc(&sort_out); + T* sort_out_ptr = sort_out.data(); + dev_ctx.template Alloc(&sort_indices); + int64_t* sort_indices_ptr = sort_indices.data(); + + TopkKernel( + dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); + + T div_factor = static_cast(2.0); + int64_t offset = 0; + int64_t i = 0; + bool is_ori_odd = stride & 1; + if (should_ignore_nan) { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + if (nan_counts[i] == stride) { + m_ptr[i * 2] = -1; + m_ptr[i * 2 + 1] = -1; + o_ptr[i] = sort_out_ptr[offset]; + } else { + int64_t nan_k = nan_counts[i] > 0 + ? static_cast(stride - nan_counts[i]) + : sort_k; + int64_t row_pos = static_cast(nan_k >> 1); + int64_t pos = offset + row_pos; + if (nan_k & 1) { + m_ptr[2 * i] = sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + o_ptr[i] = sort_out_ptr[pos]; + } else { + m_ptr[2 * i] = + row_pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + T m_val_left = + row_pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T m_val_right = sort_out_ptr[pos]; + o_ptr[i] = (m_val_left + m_val_right) / div_factor; + } + } + } + } else { + if (is_ori_odd) { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + int64_t pos = offset + sort_k - 1; + o_ptr[i] = sort_out_ptr[pos]; + m_ptr[2 * i] = sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + } + } else { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + int64_t pos = offset + sort_k - 1; + m_ptr[2 * i] = + sort_k > 1 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + T m_val_left = sort_k > 1 ? 
sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T m_val_right = sort_out_ptr[pos]; + o_ptr[i] = (m_val_left + m_val_right) / div_factor; + } + } + } +} + +template +void ProcessMedianKernel(const Context& dev_ctx, + const DenseTensor& x, + T* o_ptr, + int64_t* m_ptr, + bool ignore_nan) { + bool should_ignore_nan = ignore_nan; + const T* x_ptr = x.data(); + + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + int64_t i = 0; + + int64_t max_valid_num = 0; + std::vector nan_counts; + if (should_ignore_nan) { + int64_t total_nan_num = 0; + std::vector col_vec; + col_vec.reserve(stride); + col_vec.resize(stride); + nan_counts.clear(); + nan_counts.reserve(pre_dim); + nan_counts.resize(pre_dim); + for (int64_t i = 0; i < pre_dim; i++) { + col_vec.clear(); + col_vec.insert( + col_vec.begin(), x_ptr + i * stride, x_ptr + (i + 1) * stride); + nan_counts[i] = + std::count_if(col_vec.begin(), col_vec.end(), [&](const T& val) { + return std::isnan(static_cast(val)); + }); + total_nan_num += nan_counts[i]; + if (stride - nan_counts[i] > max_valid_num) + max_valid_num = stride - nan_counts[i]; + } + // all elems are nan + if (total_nan_num == numel) { + for (i = 0; i < pre_dim; i++) { + o_ptr[i] = x_ptr[0]; + m_ptr[2 * i] = -1; + m_ptr[2 * i + 1] = -1; + } + return; + } + should_ignore_nan = total_nan_num > 0; + } + + int64_t sort_k = should_ignore_nan ? max_valid_num : ((stride >> 1) + 1); + CalcMedianFunc(dev_ctx, + x, + nan_counts, + should_ignore_nan, + sort_k, + stride, + pre_dim, + o_ptr, + m_ptr); +} + +template +void BaseMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& axes, + DenseTensor* out, + DenseTensor* median_index, + bool ignore_nan) { + DenseTensor x; + auto rank = input.dims().size(); + if ((axes.size() == 0) || rank <= 1) { + x = input; + x.Resize({input.numel()}); + } else { + PreprocessMedianKernel(dev_ctx, input, axes, &x); + } + + T* o_ptr = dev_ctx.template Alloc(out); + int64_t* m_ptr = dev_ctx.template Alloc(median_index); + ProcessMedianKernel(dev_ctx, x, o_ptr, m_ptr, ignore_nan); + out->Resize(out->dims()); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keepdim, + DenseTensor* out, + DenseTensor* median_index) { + BaseMedianKernel(dev_ctx, x, axes, out, median_index, true); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian, + CPU, + ALL_LAYOUT, + phi::NanmedianKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc index 5b859b6ec270e..dd2b09ee39acb 100644 --- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc @@ -121,7 +121,7 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& labels, - paddle::optional weight, + const paddle::optional& weight, const DenseTensor& total_weight, const DenseTensor& d_out, int64_t ignore_index, diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc index 334b0082bde57..92cb6a1ad17de 100644 --- a/paddle/phi/kernels/cpu/nll_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc @@ -154,7 +154,7 @@ template void NllLossRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& labels, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const 
std::string& reduction, DenseTensor* out, diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index fbed3f1cb133a..b68c3ad545d33 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -24,7 +24,7 @@ template void PsroiPoolGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, const DenseTensor& dout, int pooled_height, int pooled_width, @@ -32,7 +32,7 @@ void PsroiPoolGradKernel(const Context& ctx, float spatial_scale, DenseTensor* dx) { if (dx) { - auto in_dims = x.dims(); + const auto& in_dims = x.dims(); int input_channels = in_dims[1]; int height = in_dims[2]; int width = in_dims[3]; diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc index 06cd03395d965..4f7925ad00f5a 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -23,7 +23,7 @@ template void PsroiPoolKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, int pooled_height, int pooled_width, int output_channels, diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index b0e43b6526cdd..35395dccca1af 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -20,7 +20,6 @@ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index 961bc7a214be5..ab6f98ffcd5d6 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -330,7 +330,7 @@ void RnnFunc(const Context& dev_ctx, } } - DenseTensor* input_holder; + DenseTensor* input_holder = nullptr; DenseTensor* output_holder = output; bool has_allocate_mem = false; diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index 9b5e5cb5443b1..4dd1894320af7 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -969,7 +969,7 @@ void RnnGradFunc(const CPUContext& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, const DenseTensor& out, const DenseTensor& dropout_state, const DenseTensor& reserve, @@ -1244,7 +1244,7 @@ void RnnGradKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, const DenseTensor& out, const DenseTensor& dropout_state, const DenseTensor& reserve, diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index ae2c7a72635f7..80c521918ed07 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -819,7 +819,7 @@ void RnnKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, float dropout_prob, bool is_bidirec, int input_size, 
diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc index a91b8b6c1fcd3..ea01121509f1a 100644 --- a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -73,7 +73,7 @@ template void RoiAlignGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& out_grad, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index 4752a9b3a48fd..cd779b72e7a84 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -182,7 +182,7 @@ template void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc index 0eaa873590eb0..f2fcfa5648d3f 100644 --- a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -25,7 +25,7 @@ template void RoiPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& arg_max, const DenseTensor& out_grad, int pooled_height, diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc index 02020354cd357..e088e9a2831cb 100644 --- a/paddle/phi/kernels/cpu/roi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -24,7 +24,7 @@ template void RoiPoolKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc b/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc new file mode 100644 index 0000000000000..10b6c6b1a3ea8 --- /dev/null +++ b/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/rrelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + const T* n_ptr = noise.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + if (!x_grad) return; + + int i = 0; + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = x_ptr[i] > 0 ? 
out_grad_ptr[i] : n_ptr[i] * out_grad_ptr[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + rrelu_grad, CPU, ALL_LAYOUT, phi::RReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/rrelu_kernel.cc b/paddle/phi/kernels/cpu/rrelu_kernel.cc new file mode 100644 index 0000000000000..4c6e30beddfa3 --- /dev/null +++ b/paddle/phi/kernels/cpu/rrelu_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/rrelu_kernel.h" + +#include "paddle/fluid/framework/generator.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + T* n_ptr = dev_ctx.template Alloc(noise); + T zero = static_cast(0); + int numel = x.numel(); + int i = 0; + + if (is_test) { + T mid_val = static_cast((lower + upper) / 2.0); + for (i = 0; i < numel; i++) { + if (x_ptr[i] < zero) { + o_ptr[i] = mid_val * x_ptr[i]; + n_ptr[i] = mid_val; + } else { + o_ptr[i] = x_ptr[i]; + n_ptr[i] = 1.0; + } + } + + return; + } + + auto engine = paddle::framework::GetCPURandomEngine(0); + + std::uniform_real_distribution dist(lower, upper); + + for (i = 0; i < numel; i++) { + if (x_ptr[i] < zero) { + T scale = static_cast(dist(*engine)); + o_ptr[i] = scale * x_ptr[i]; + n_ptr[i] = scale; + } else { + o_ptr[i] = x_ptr[i]; + n_ptr[i] = 1.0; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu, + CPU, + ALL_LAYOUT, + phi::RReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/cpu/sgd_kernel.cc b/paddle/phi/kernels/cpu/sgd_kernel.cc index c7b4074c70aaa..214fd82bef358 100644 --- a/paddle/phi/kernels/cpu/sgd_kernel.cc +++ b/paddle/phi/kernels/cpu/sgd_kernel.cc @@ -118,7 +118,7 @@ void SGDDenseKernel(const Context& dev_ctx, const DenseTensor& param, const DenseTensor& learning_rate, const DenseTensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -132,7 +132,7 @@ void SGDDenseParamSparseGradKernel( const DenseTensor& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -146,7 +146,7 @@ void SGDSparseParamSparseGradKernel( const SelectedRows& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, SelectedRows* param_out, SelectedRows* master_param_out) { diff --git a/paddle/phi/kernels/cpu/tril_indices_kernel.cc b/paddle/phi/kernels/cpu/tril_indices_kernel.cc index 
c515a69f011d5..71c5cd820b383 100644 --- a/paddle/phi/kernels/cpu/tril_indices_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_indices_kernel.cc @@ -26,7 +26,7 @@ void TrilIndicesKernel(const Context& dev_ctx, DataType dtype, DenseTensor* out) { T* out_data = dev_ctx.template Alloc(out); - auto out_dims = out->dims(); + const auto& out_dims = out->dims(); int64_t tril_size = out_dims[1]; int64_t i = 0; T r = std::max(0, -offset), c = 0; diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc index 91a6903418230..c95a8f4ded6dc 100644 --- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -54,7 +54,6 @@ void UniformRandomRawKernel(const Context &dev_ctx, float diag_val, DenseTensor *out) { out->Resize(phi::make_ddim(shape.GetData())); - VLOG(4) << out->dims(); T *data = dev_ctx.template Alloc(out); auto size = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc index acd9a99cef4de..383009229f9a1 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc @@ -121,7 +121,7 @@ void Yolov3LossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const DenseTensor& loss_grad, const DenseTensor& objectness_mask, const DenseTensor& gt_match_mask, diff --git a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc index 6df910eea02a9..8a190ab25a7b2 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc @@ -182,7 +182,7 @@ void Yolov3LossKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, diff --git a/paddle/phi/kernels/deformable_conv_grad_kernel.h b/paddle/phi/kernels/deformable_conv_grad_kernel.h index 85786cec4c3e5..04fe7904a4509 100644 --- a/paddle/phi/kernels/deformable_conv_grad_kernel.h +++ b/paddle/phi/kernels/deformable_conv_grad_kernel.h @@ -23,7 +23,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/phi/kernels/deformable_conv_kernel.h b/paddle/phi/kernels/deformable_conv_kernel.h index fbbe5f62c6a29..7b66e506b8928 100644 --- a/paddle/phi/kernels/deformable_conv_kernel.h +++ b/paddle/phi/kernels/deformable_conv_kernel.h @@ -24,7 +24,7 @@ void DeformableConvKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, diff --git a/paddle/phi/kernels/dropout_kernel.h b/paddle/phi/kernels/dropout_kernel.h index dc9f89e08e17a..6febcd78e1107 100644 --- a/paddle/phi/kernels/dropout_kernel.h +++ b/paddle/phi/kernels/dropout_kernel.h @@ -22,7 +22,7 @@ namespace phi { template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional seed_tensor, + const paddle::optional& 
seed_tensor, float p, bool is_test, const std::string& mode, diff --git a/paddle/phi/kernels/einsum_grad_kernel.h b/paddle/phi/kernels/einsum_grad_kernel.h index 5c1970e775825..06785c8532e70 100644 --- a/paddle/phi/kernels/einsum_grad_kernel.h +++ b/paddle/phi/kernels/einsum_grad_kernel.h @@ -21,6 +21,7 @@ namespace phi { template void EinsumGradKernel(const Context& dev_ctx, const std::vector& x, + const std::vector& inner_cache, const DenseTensor& out_grad, const std::string& equation, std::vector x_grad); diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h index 3d9e8feda748d..87df2b1c64a4a 100644 --- a/paddle/phi/kernels/einsum_kernel.h +++ b/paddle/phi/kernels/einsum_kernel.h @@ -24,4 +24,11 @@ void EinsumKernel(const Context& dev_ctx, const std::string& equation, DenseTensor* out); +template +void EinsumKernelRaw(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache); + } // namespace phi diff --git a/paddle/phi/kernels/elementwise_add_grad_kernel.h b/paddle/phi/kernels/elementwise_add_grad_kernel.h index 9b754cfefe365..8fc31c8878b6a 100644 --- a/paddle/phi/kernels/elementwise_add_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_add_grad_kernel.h @@ -32,8 +32,8 @@ template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout); diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h index 6d29dae99a131..c764f05c3983f 100644 --- a/paddle/phi/kernels/elementwise_divide_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -34,8 +34,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, const DenseTensor& dx, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* dy, DenseTensor* dout, diff --git a/paddle/phi/kernels/elementwise_multiply_grad_kernel.h b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h index 517948a50d1b1..9cbd5040666cf 100644 --- a/paddle/phi/kernels/elementwise_multiply_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h @@ -33,8 +33,8 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* dx, DenseTensor* dy, @@ -45,11 +45,11 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& d_dx, const DenseTensor& d_dy, - paddle::optional d_ddout, + const paddle::optional& d_ddout, int axis, DenseTensor* d_x, DenseTensor* d_y, diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h index 97df769f4d046..536d859b46a7b 100644 --- a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h @@ -31,8 +31,8 @@ template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - 
paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout); diff --git a/paddle/phi/kernels/expand_as_kernel.h b/paddle/phi/kernels/expand_as_kernel.h index 971ea32310f3e..6bc6c73e737d7 100644 --- a/paddle/phi/kernels/expand_as_kernel.h +++ b/paddle/phi/kernels/expand_as_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void ExpandAsKernel(const Context& ctx, const DenseTensor& x, - paddle::optional y, + const paddle::optional& y, const std::vector& target_shape, DenseTensor* out); diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 2868aa5acb75e..db4796b3f61ca 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1284,9 +1284,9 @@ void Blas::MatMul(const phi::DenseTensor &mat_a, T alpha, phi::DenseTensor *mat_out, T beta) const { - auto dim_a = mat_a.dims(); - auto dim_b = mat_b.dims(); - auto dim_out = mat_out->dims(); + const auto &dim_a = mat_a.dims(); + const auto &dim_b = mat_b.dims(); + const auto &dim_out = mat_out->dims(); PADDLE_ENFORCE_EQ( dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true, diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cc b/paddle/phi/kernels/funcs/eigen/broadcast.cc index 3459d7acd6baf..008c51249f249 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cc +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cu b/paddle/phi/kernels/funcs/eigen/broadcast.cu index d9de69ec55e8b..742081a30c1a0 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cu +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cu @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/funcs/norm_utils.h b/paddle/phi/kernels/funcs/norm_utils.h new file mode 100644 index 0000000000000..2d0a879e41c78 --- /dev/null +++ b/paddle/phi/kernels/funcs/norm_utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +inline void ExtractNCWHD(const phi::DDim &dims, + const DataLayout &data_layout, + int *N, + int *C, + int *H, + int *W, + int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 417c1cd234754..b0e68abc08a57 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -170,7 +170,7 @@ template __global__ void KernelPool2DGrad(const int nthreads, const T* __restrict__ input_data, const T* __restrict__ output_data, - const const T* __restrict__ output_grad, + const T* __restrict__ output_grad, const int output_width, const int output_height, const int input_width, diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index fbd744430aa11..e6bd371935622 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -90,7 +90,7 @@ class SegmentPoolGradFunctor { const DenseTensor& out_grad, const DenseTensor& segments, DenseTensor* in_grad, - paddle::optional index, + const paddle::optional& index, const std::string pooltype = "SUM") { const IndexT* segment_ids = segments.data(); auto& place = *dev_ctx.eigen_device(); diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 95606b1526729..687cccb1f64f9 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -417,7 +417,7 @@ class SegmentPoolGradFunctor { const DenseTensor& out_grad, const DenseTensor& segments, DenseTensor* in_grad, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const std::string pooltype = "SUM") { if (pooltype == "MAX" || pooltype == "MIN") { SegmentPoolCUDAGradFunctor( diff --git a/paddle/phi/kernels/funcs/segment_pooling.h b/paddle/phi/kernels/funcs/segment_pooling.h index b8281061582ea..09da9eb304773 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.h +++ b/paddle/phi/kernels/funcs/segment_pooling.h @@ -41,7 +41,7 @@ class SegmentPoolGradFunctor { const DenseTensor& out_grad, const DenseTensor& segments, DenseTensor* in_grad, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const std::string pooltype = "SUM"); }; diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 336e9c809427c..a6a6d4097030b 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -27,10 +27,10 @@ namespace phi { namespace 
funcs { -inline int64_t GetBatchSize(phi::DDim dims) { +inline int64_t GetBatchSize(const phi::DDim &dims) { int64_t batch_size = 1; auto dim_size = dims.size(); - for (int i = 0; i < dim_size - 2; i++) { + for (int i = 0; i < dim_size - 2; ++i) { batch_size *= dims[i]; } return batch_size; @@ -54,6 +54,24 @@ static void CheckEighResult(const int batch, const int info) { info)); } +#ifdef PADDLE_WITH_CUDA +static void CheckEighResult(const GPUContext &dev_ctx, + const int64_t batch_size, + int *info) { + std::vector error_info(batch_size); + paddle::memory::Copy(phi::CPUPlace(), + error_info.data(), + dev_ctx.GetPlace(), + info, + sizeof(int) * batch_size, + dev_ctx.stream()); + dev_ctx.Wait(); + for (auto i = 0; i < batch_size; ++i) { + CheckEighResult(i, error_info[i]); + } +} +#endif + template struct MatrixEighFunctor { void operator()(const DeviceContext &dev_ctx, @@ -95,7 +113,8 @@ struct MatrixEighFunctor { char jobz = has_vectors ? 'V' : 'N'; int n = dims[dim_size - 1]; int64_t lda = std::max(1, n); - // if work = -1, it means that you need to use the lapack function to query + // if work = -1, it means that you need to use the lapack function to + // query // the optimal value int lwork = -1; // The length of the array work int lrwork = -1; // The dimension of the array rwork,rwork is REAL array @@ -188,97 +207,92 @@ struct MatrixEighFunctor { bool is_lower, bool has_vectors) { using ValueType = phi::dtype::Real; - ValueType *out_value = dev_ctx.template Alloc(eigen_values); - DenseTensor input_trans; - input_trans = phi::TransposeLast2Dim(dev_ctx, input); - T *input_vector = input_trans.data(); + int workspace_size = 0; auto &dims = input.dims(); int dim_size = dims.size(); int64_t batch_size = GetBatchSize(dims); + int last_dim = dims[dim_size - 1]; + int lda = std::max(1, last_dim); + auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; + auto values_stride = dims[dim_size - 1]; cublasFillMode_t uplo = is_lower ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; cusolverEigMode_t jobz = has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; - int n = dims[dim_size - 1]; - int lda = std::max(1, n); - auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; - auto values_stride = dims[dim_size - 1]; - int lwork = 0; + ValueType *out_value = dev_ctx.template Alloc(eigen_values); auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size); auto *info_ptr = reinterpret_cast(info->ptr()); - // When the input type is float32, and the feature value input dimension - // is greater than or equal to [*,32,32] and less than or equal to - // [*,512,512], Syevj has better performance. + DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); + T *input_vector = input_trans.data(); + + // Once input data type is float32, and the last dimension of + // input is located in range [32, 512], Syevj works better. 
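// Note on the solver selection below: cusolverDnSsyevj implements the Jacobi
// eigenvalue algorithm, which empirically beats the default divide-and-conquer
// path (syevd, reached through the EvdBuffer/Evd helpers) only for FLOAT32
// inputs whose matrix dimension lies in roughly [32, 512]; outside that range
// the Evd path is kept. Sketch of the dispatch, assuming the shapes above:
//   use_syevj == true  -> cusolverDn<S>syevj, one call per matrix in the batch
//   use_syevj == false -> Evd(...) wrapper over the syevd-style solver
// Because every matrix in the batch has the same size, the buffer-size query is
// issued once and the resulting workspace is reused inside the per-batch loop;
// the per-matrix status codes written to info_ptr[i] are copied back and
// validated in a single pass by the batched CheckEighResult overload afterwards.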
bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 && values_stride >= 32 && values_stride <= 512); + auto handle = dev_ctx.cusolver_dn_handle(); + syevjInfo_t syevj_params; if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCreateSyevjInfo(&syevj_params)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( dev_ctx.cusolver_dn_handle(), jobz, uplo, - n, + last_dim, reinterpret_cast(input_vector), lda, reinterpret_cast(out_value), - &lwork, + &workspace_size, syevj_params)); } else { EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, - n, + last_dim, input_vector, lda, out_value, - &lwork); + &workspace_size); } - auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * lwork); + auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size); auto *work_ptr = reinterpret_cast(work->ptr()); - for (auto i = 0; i < batch_size; i++) { + + for (auto i = 0; i < batch_size; ++i) { auto *input_data = input_vector + i * vector_stride; auto *value_data = out_value + i * values_stride; - auto handle = dev_ctx.cusolver_dn_handle(); if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnSsyevj(handle, jobz, uplo, - n, + last_dim, reinterpret_cast(input_data), lda, reinterpret_cast(value_data), reinterpret_cast(work_ptr), - lwork, - info_ptr, + workspace_size, + &info_ptr[i], syevj_params)); } else { Evd(handle, jobz, uplo, - n, + last_dim, input_data, lda, value_data, work_ptr, - lwork, - info_ptr); + workspace_size, + &info_ptr[i]); } - int error_info = 0; - paddle::memory::Copy(phi::CPUPlace(), - &error_info, - dev_ctx.GetPlace(), - info_ptr, - sizeof(int), - dev_ctx.stream()); - CheckEighResult(i, error_info); } + CheckEighResult(dev_ctx, batch_size, info_ptr); if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 33b6f3a5a1bee..449aaae1a4be4 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -135,8 +135,8 @@ void AdamDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 4873ba9c13d48..0fff142567a5e 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -146,8 +146,8 @@ void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index ad3b8579ddf67..c08fa4eb260d4 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -309,11 +309,11 @@ 
void BatchNormGradRawKernel(const Context &ctx, const DenseTensor &x, const DenseTensor &scale, const DenseTensor &bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional &mean, + const paddle::optional &variance, const DenseTensor &saved_mean, const DenseTensor &saved_variance, - paddle::optional reserve_space, + const paddle::optional &reserve_space, const DenseTensor &y_grad, float momentum, float epsilon_f, @@ -351,7 +351,7 @@ void BatchNormGradRawKernel(const Context &ctx, x_dims.size(), x_dims)); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // init output if (d_x) { @@ -867,11 +867,11 @@ void BatchNormGradKernel(const Context &dev_ctx, const DenseTensor &x, const DenseTensor &scale, const DenseTensor &bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional &mean, + const paddle::optional &variance, const DenseTensor &saved_mean, const DenseTensor &saved_variance, - paddle::optional reserve_space, + const paddle::optional &reserve_space, const DenseTensor &y_grad, float momentum, float epsilon, @@ -910,8 +910,8 @@ template void BatchNormDoubleGradKernel(const Context &ctx, const DenseTensor &x, const DenseTensor &scale, - paddle::optional mean, - paddle::optional variance, + const paddle::optional &mean, + const paddle::optional &variance, const DenseTensor &saved_mean, const DenseTensor &saved_variance, const DenseTensor &y_grad, diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 361e62e566035..e2aeec723628c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -179,7 +179,7 @@ void BatchNormKernel(const Context &ctx, ctx.template Alloc(y); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); auto dtype = paddle::platform::CudnnDataType::type; diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu index a4ec894790cd3..8e60b31c3706b 100644 --- a/paddle/phi/kernels/gpu/bincount_kernel.cu +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -49,7 +49,7 @@ __global__ void KernelBincount(const InputT* input, template void BincountCUDAInner(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { const DenseTensor* input = &x; @@ -143,7 +143,7 @@ void BincountCUDAInner(const Context& dev_ctx, template void BincountKernel(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { if (x.dtype() == DataType::INT32) { diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu index bd1683ad0c7d8..fae0e8cb25b5c 100644 --- a/paddle/phi/kernels/gpu/dropout_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_kernel.cu @@ -23,7 +23,7 @@ namespace phi { template void DropoutRawKernel(const Context& dev_ctx, const 
DenseTensor& x, - paddle::optional seed_tensor, + const paddle::optional& seed_tensor, float p, bool is_test, const std::string& mode, diff --git a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu index c8a8745f34522..950f811475c99 100644 --- a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_grad_impl.h" -PD_REGISTER_KERNEL( - einsum_grad, GPU, ALL_LAYOUT, phi::EinsumGradKernel, float, double) {} +PD_REGISTER_KERNEL(einsum_grad, + GPU, + ALL_LAYOUT, + phi::EinsumGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu index d73e154eb40f7..d1f4c6590387a 100644 --- a/paddle/phi/kernels/gpu/einsum_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_kernel.cu @@ -18,4 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, GPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} +PD_REGISTER_KERNEL(einsum, + GPU, + ALL_LAYOUT, + phi::EinsumKernelRaw, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu index 8dd4d0184c267..517fbcba158b8 100644 --- a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu @@ -57,8 +57,8 @@ template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu index 017616df2782c..45e19b9838405 100644 --- a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -47,8 +47,8 @@ template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index 96ebc0353ef24..b80634357d62f 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -59,34 +59,20 @@ void GaussianRandomKernel(const Context& dev_ctx, int seed, DataType dtype, DenseTensor* out) { - auto tensor = out; - - bool seed_flag = false; + out->Resize(phi::make_ddim(shape.GetData())); + dev_ctx.template Alloc(out); if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - tensor->Resize(phi::make_ddim(shape.GetData())); - - T* data = dev_ctx.template Alloc(tensor); - - int64_t size = tensor->numel(); - - int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { + // use global Generator seed using MT = typename phi::dtype::MPTypeTrait::Type; 
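// Seeding note: with seed == 0 the sampling below goes through
// funcs::distribution_and_transform, which draws its seed/offset from the
// device's default Generator, so successive calls advance the framework-wide
// random stream rather than reseeding from std::random_device as the removed
// code did. normal_transform then maps standard-normal samples z to the
// requested distribution, i.e. x = mean + std * z. A non-zero seed takes the
// OP-seed branch instead: the GaussianGenerator functor applied by IndexKernel,
// which keeps results reproducible for a fixed seed attribute.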
funcs::normal_distribution dist; funcs::normal_transform trans(static_cast(mean), static_cast(std)); - funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { + // use OP seed auto func = GaussianGenerator(static_cast(mean), static_cast(std), seed); - IndexKernel>(dev_ctx, tensor, func); + IndexKernel>(dev_ctx, out, func); } } diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index 34bd1d6db77da..9869d5a517bcb 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -286,8 +286,8 @@ void GraphReindexKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& neighbors, const DenseTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const paddle::optional& hashtable_value, + const paddle::optional& hashtable_index, bool flag_buffer_hashtable, DenseTensor* reindex_src, DenseTensor* reindex_dst, diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index af616963b499a..174495dad34b2 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -356,8 +356,8 @@ void GraphSampleNeighborsKernel( const DenseTensor& row, const DenseTensor& col_ptr, const DenseTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, + const paddle::optional& eids, + const paddle::optional& perm_buffer, int sample_size, bool return_eids, bool flag_perm_buffer, diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu index 2be0caff79d64..8743b4e8a7408 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu @@ -105,8 +105,8 @@ void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - paddle::optional out, - paddle::optional dst_count, + const paddle::optional& out, + const paddle::optional& dst_count, const DenseTensor& out_grad, const std::string& pool_type, DenseTensor* x_grad) { diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu index 6b1e58981baa0..c0e557f09bcc9 100644 --- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -27,12 +27,9 @@ namespace cub = hipcub; #endif -#include -#include -#include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -144,27 +141,21 @@ struct GumbleNoiseGenerator { DenseTensor random_tensor; int64_t size = size_to_axis * size_from_axis; random_tensor.Resize(make_ddim({size})); - auto* random_data = ctx.template Alloc(&random_tensor); - thrust::counting_iterator index_sequence_begin(0); + T* random_data = ctx.template Alloc(&random_tensor); // generate gumbel noise int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy()) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, - index_sequence_begin + 
size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); - } else { - const unsigned int seed = std::random_device()(); - thrust::transform(index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed)); - } + auto gen_cuda = ctx.GetGenerator(); + + auto seed_offset = gen_cuda->IncrementOffset(1); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + thrust::counting_iterator index_sequence_begin(0); + thrust::transform(index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(random_data), + UniformCUDAGenerator(0.00001, 1, seed, size * offset)); // add gumbel noise to X const int thread_size = 512; diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu new file mode 100644 index 0000000000000..b72acc7073383 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -0,0 +1,642 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { +template +static __global__ void GradComputeDX(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int sample_size, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + BatchNormParamType mean_val = mean[ncid]; + BatchNormParamType inv_var_val = variance[ncid]; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + BatchNormParamType dy_i = static_cast>(dy[i]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[i]) - mean_val); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = + BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] = + (static_cast>(dy[i]) - + dy_sum_val / static_cast>(sample_size) - 
+ (static_cast>(x[i]) - mean_val) * + dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * + scale[c] * inv_var_val; + } +} + +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. / sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + const T *scale, + const T *ddscale, + int C, + int sample_size, + const double epsilon, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T dy_sum_val; + __shared__ T ddx_sum_val; + __shared__ T dy_mul_ddx_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T ddx_sum = 0; + T dy_mul_ddx_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + T dy_i = dy[i]; + T tmp = x[i] - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += + ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - dy[i]) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (ddx_sum_val / sample_size - ddx[i])) * + scale[c]; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - + (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * + var_val / sample_size) * + ddscale[c]; + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const T *mean, + const T *variance, + const T *ddscale, + const T *ddbias, + const T *ddx, + const T *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T ddx_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T ddx_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += scale[c] * var_val * + (ddx[i] - ddx_sum_val / sample_size - + (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * + var_val / sample_size); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += ddbias[c]; + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + T *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ T dy_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T dy_i = dy[i]; + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = 
dy_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + T dscale_tmp = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dscale_tmp += + ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * + var_val * var_val / sample_size); + } + dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + if (threadIdx.x == 0) { + dscale[ncid] += dscale_tmp; + } + __syncthreads(); + } +} + +template +void InstanceNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &d_y, + const paddle::optional &scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + float epsilon_f, + DenseTensor *d_x, + DenseTensor *d_scale, + DenseTensor *d_bias) { + double epsilon = static_cast(epsilon_f); + const auto *scale_ptr = scale.get_ptr(); + + const auto &x_dims = x.dims(); + + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + DenseTensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D}); + + dev_ctx.template Alloc(d_x); + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + } + if (scale_ptr) { + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of scale's dimensions must be equal to 1. But " + "received: the size of scale's dimensions" + "is [%d]", + scale_ptr->dims().size())); + PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], + C, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the first dimension of scale must be equal to " + "Channels([%d]). 
But received: " + "the first dimension of scale is [%d]," + "the dimensions of scale is [%s], ", + C, + scale_ptr->dims()[0], + scale_ptr->dims())); + } + + phi::funcs::SetConstant set_constant; + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + + DenseTensor d_scale_tmp; + d_scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_scale_tmp); + + DenseTensor d_bias_tmp; + d_bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_bias_tmp); + + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + if ((H * W * D) == 1) { + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + phi::funcs::SetConstant> functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + if (d_scale && d_bias) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + miopenBNSpatial, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } else { + if (d_x) { + GradComputeDX<<>>( + d_y.data(), + scale_tmp.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + H * W * D, + d_x->data()); + } + } + + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +template +void InstanceNormDoubleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &dy, + const paddle::optional &ddx, + const paddle::optional &ddscale, + const paddle::optional &ddbias, + float epsilon_f, + DenseTensor *dx, + DenseTensor *dscale, + DenseTensor *ddy) { + const auto *Scale = scale.get_ptr(); + const auto *ddX = ddx.get_ptr(); + const auto *ddScale = ddscale.get_ptr(); + const auto *ddBias = ddbias.get_ptr(); + const double epsilon = static_cast(epsilon_f); + const T *x_data = x.data(); + const T *dy_data = dy.data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); + const T *ddbias_data = (ddScale == nullptr ? 
nullptr : ddBias->data()); + const T *mean_data = saved_mean.data(); + const T *variance_data = saved_variance.data(); + phi::funcs::SetConstant set_zero; + auto &x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + const int n = x.numel(); + int sample_size = n / N / C; + + DenseTensor scale_tmp; + if (!Scale) { + scale_tmp.Resize({C}); + dev_ctx.template Alloc(&scale_tmp); + set_zero(dev_ctx, &scale_tmp, static_cast(1)); + } + const T *scale_data = Scale ? Scale->data() : scale_tmp.data(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = NxC; + const int grid1 = (C + block - 1) / block; + + if (dx) { + T *dx_data = dev_ctx.template Alloc(dx); + set_zero(dev_ctx, dx, static_cast(0)); + DoubleGradComputeDX<<>>( + x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + scale_data, + ddscale_data, + C, + sample_size, + epsilon, + dx_data); + } + if (dscale) { + DenseTensor dscale_tmp; + dscale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&dscale_tmp); + set_zero(dev_ctx, &dscale_tmp, static_cast(0)); + T *dscale_tmp_data = dscale_tmp.data(); + + T *dscale_data = dev_ctx.template Alloc(dscale); + set_zero(dev_ctx, dscale, static_cast(0)); + DoubleGradComputeDScale<<>>( + x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + C, + sample_size, + epsilon, + dscale_tmp_data); + add_param<<>>( + dscale_tmp.data(), dscale->data(), N, C); + } + if (ddy) { + T *ddy_data = dev_ctx.template Alloc(ddy); + set_zero(dev_ctx, ddy, static_cast(0)); + DoubleGradComputeDDY<<>>( + x_data, + mean_data, + variance_data, + ddscale_data, + ddbias_data, + ddx_data, + scale_data, + C, + sample_size, + epsilon, + ddy_data); + } +} +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float) {} +#else +PD_REGISTER_KERNEL(instance_norm_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu new file mode 100644 index 0000000000000..b729223689809 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -0,0 +1,220 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
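
Note on the instance norm kernels above and below: the implementation reuses the cuDNN/MIOpen batch-norm routines by viewing the N x C x H x W x D input as a single batch with N*C channels, repeating the C-length scale/bias to length N*C before the library call, and summing the N*C partial d_scale/d_bias results back down to C afterwards. A minimal host-side sketch of that repeat/reduce bookkeeping follows; RepeatParam and AddParam are illustrative stand-ins for the repeat_param/add_param CUDA kernels, not part of the patch.

#include <cstddef>
#include <vector>

// Instance norm treats every (n, c) pair as its own channel, so a C-length
// scale is repeated N times before the batch-norm call, and the N*C partial
// d_scale / d_bias results are summed back down to length C afterwards.
std::vector<float> RepeatParam(const std::vector<float>& scale, int N) {
  const std::size_t C = scale.size();
  std::vector<float> out(static_cast<std::size_t>(N) * C);
  for (std::size_t i = 0; i < out.size(); ++i) {
    out[i] = scale[i % C];  // same rule as repeat_param: output[i] = input[i % C]
  }
  return out;
}

std::vector<float> AddParam(const std::vector<float>& d_scale_nxc, int N) {
  const std::size_t C = d_scale_nxc.size() / static_cast<std::size_t>(N);
  std::vector<float> out(C, 0.0f);
  for (std::size_t i = 0; i < d_scale_nxc.size(); ++i) {
    out[i % C] += d_scale_nxc[i];  // accumulate the N per-instance results per channel
  }
  return out;
}
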
+ +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { + +template +void InstanceNormKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const paddle::optional &bias, + float epsilon_f, + DenseTensor *y, + DenseTensor *saved_mean, + DenseTensor *saved_variance) { + double epsilon = static_cast(epsilon_f); + auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must greater than " + "or equal to 2. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must smaller than" + "or equal to 5. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + DenseTensor x_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + dev_ctx.template Alloc(y); + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + DenseTensor bias_tmp; + bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&bias_tmp); + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + phi::funcs::SetConstant set_constant; + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + if (bias_ptr) { + repeat_param<<>>( + bias_ptr->data(), bias_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &bias_tmp, static_cast(0)); + } + + auto handle = dev_ctx.cudnn_handle(); + + phi::funcs::SetConstant> functor; + dev_ctx.template Alloc>(saved_mean); + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationForwardTraining( + handle, + miopenBNSpatial, + const_cast( + static_cast(CudnnDataType::kOne())), + const_cast( + static_cast(CudnnDataType::kZero())), + data_desc_, + static_cast(x_tmp.template data()), + data_desc_, + static_cast(y->template data()), + in_param_desc_, + const_cast(static_cast( + scale_tmp.template data>())), + const_cast(static_cast( + bias_tmp.template data>())), + 0, + nullptr, + nullptr, + epsilon, + static_cast( + saved_mean->template data>()), + static_cast( + saved_variance->template data>()))); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + y->template data(), + in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), + 0, + nullptr, + nullptr, + epsilon, + saved_mean->template data>(), + saved_variance->template data>())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {} +#else +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h b/paddle/phi/kernels/gpu/instance_norm_utils.h new file mode 100644 index 0000000000000..50dfe4ad222c0 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_utils.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ void repeat_param(const T *input, + T *output, + const int repeat_num, + const int C) { + CUDA_KERNEL_LOOP(i, repeat_num * C) { + int index = i % C; + output[i] = input[index]; + } +} + +template +static __global__ void add_param(const T *input, + T *output, + const int repeat_num, + const int C) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ou_storage; + for (int i = blockIdx.x; i < C; i += gridDim.x) { + T ou = static_cast(0); + for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { + const int index = j * C + i; + ou += static_cast(input[index]); + } + ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); + if (threadIdx.x == 0) { + output[i] = ou; + } + __syncthreads(); + + if (AVG) { + output[i] /= repeat_num; + } + } +} +} // namespace phi diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index 73334d9c38aa3..cd0f4e1493e5c 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -747,9 +747,9 @@ template static void Interpolate1DCUDABwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_w, @@ -861,9 +861,9 @@ template static void Interpolate2DCUDABwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_h, @@ -1124,9 +1124,9 @@ template static void Interpolate3DCUDABwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_d, @@ -1334,9 +1334,9 @@ template void InterpolateGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + 
const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout, int out_d, @@ -1401,9 +1401,9 @@ template void BilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1435,9 +1435,9 @@ template void NearestInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1469,9 +1469,9 @@ template void TrilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1503,9 +1503,9 @@ template void LinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1537,9 +1537,9 @@ template void BicubicInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index 6e609aa11674e..3bd59c807103c 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -627,9 +627,9 @@ template static void Interpolate1DCUDAFwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_w, const std::vector& scale, @@ -742,9 +742,9 @@ template static void Interpolate2DCUDAFwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_h, int out_w, @@ -997,9 +997,9 @@ template static void Interpolate3DCUDAFwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1221,9 +1221,9 @@ template void InterpolateKernel( const Context& dev_ctx, 
const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1283,9 +1283,9 @@ template void BilinearInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1315,9 +1315,9 @@ template void NearestInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1347,9 +1347,9 @@ template void TrilinearInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1379,9 +1379,9 @@ template void LinearInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1411,9 +1411,9 @@ template void BicubicInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, diff --git a/paddle/phi/kernels/gpu/label_smooth_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_kernel.cu index 50f7548450ce7..bf7ac939eb389 100644 --- a/paddle/phi/kernels/gpu/label_smooth_kernel.cu +++ b/paddle/phi/kernels/gpu/label_smooth_kernel.cu @@ -53,7 +53,7 @@ __global__ void LabelSmoothRunDistKernel(const int N, template void LabelSmoothKernel(const Context& ctx, const DenseTensor& label, - paddle::optional prior_dist, + const paddle::optional& prior_dist, float epsilon, DenseTensor* out) { auto label_dim = label.dims()[label.dims().size() - 1]; diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index 146d307a59380..961937441e1cf 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -24,8 +24,8 @@ namespace phi { template void LayerNormGradKernel(const Context &dev_ctx, const DenseTensor &x, - paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional &scale_opt, + const paddle::optional &bias_opt, const DenseTensor &mean, const DenseTensor &variance, const DenseTensor &out_grad, diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index d87b7c2193811..72127042c16e0 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -55,8 +55,8 @@ 
template class LayerNormDirectCUDAFunctor; template void LayerNormKernel(const Context &dev_ctx, const DenseTensor &x, - paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional &scale_opt, + const paddle::optional &bias_opt, float epsilon, int begin_norm_axis, bool is_test, diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu new file mode 100644 index 0000000000000..a7cd49c0e53f3 --- /dev/null +++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu @@ -0,0 +1,122 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/nanmedian_grad_kernel.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelNanmedianGrad(const T* x_ptr, + const int64_t* medians_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + int64_t stride, + int64_t pre_dim, + T div_factor) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t offset = index * stride; + if (medians_ptr[2 * index] >= 0) { + if (medians_ptr[2 * index] == medians_ptr[2 * index + 1]) { + x_grad_ptr[offset + medians_ptr[2 * index]] = out_grad_ptr[index]; + } else { + x_grad_ptr[offset + medians_ptr[2 * index]] = + out_grad_ptr[index] / div_factor; + x_grad_ptr[offset + medians_ptr[2 * index + 1]] = + out_grad_ptr[index] / div_factor; + } + } + } +} + +template +void CalcMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + DenseTensor* x_grad, + T* x_grad_ptr) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0)); + + auto stream = dev_ctx.stream(); + const T* x_ptr = x.data(); + const int64_t* m_ptr = median_index.data(); + const T* out_grad_ptr = out_grad.data(); + + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + + T div_factor = static_cast(2.0); + KernelNanmedianGrad< + T><<>>( + x_ptr, m_ptr, out_grad_ptr, x_grad_ptr, stride, pre_dim, div_factor); +} + +template +void BaseMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad) { + auto rank = x.dims().size(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (axes.size() && (rank > 1)) { + DenseTensor tmp_x_grad(*x_grad); + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, &tmp_x_grad, x_grad_ptr); + 
PostprocessMedianGradKernel(dev_ctx, &tmp_x_grad, axes, x_grad); + } else { + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, x_grad, x_grad_ptr); + } +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad) { + BaseMedianGradKernel( + dev_ctx, input, median_index, out_grad, axes, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian_grad, + GPU, + ALL_LAYOUT, + phi::NanmedianGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu new file mode 100644 index 0000000000000..5975e2748997e --- /dev/null +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -0,0 +1,287 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/nanmedian_kernel.h" +#include "paddle/phi/kernels/top_k_kernel.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelNanCounts(const T* input, + const int numel, + const int64_t pre_dim, + const int64_t stride, + T min_val, + int64_t* nan_total, + int64_t* nan_counts) { + extern __shared__ int64_t buf[]; + for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { + buf[i] = 0; + nan_counts[i] = 0; + } + + if (threadIdx.x == 0) { + nan_total[0] = 0; + nan_total[1] = 0; + } + + __syncthreads(); + + CUDA_KERNEL_LOOP(index, numel) { + const T x = input[index]; + if (isnan(static_cast(x))) { + auto bin = static_cast(index / stride); + paddle::platform::CudaAtomicAdd(&buf[bin], 1); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&nan_counts[i], buf[i]); + paddle::platform::CudaAtomicAdd(&nan_total[0], buf[i]); + paddle::platform::CudaAtomicMax(&nan_total[1], stride - buf[i]); + } +} + +template +__global__ void CalcMedianKernel(const T* sort_out_ptr, + const int64_t* sort_indices_ptr, + int64_t* median_val, + T* output, + T div_factor, + const bool is_odd, + const int64_t pre_dim, + const int64_t stride) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t pos = static_cast((index + 1) * stride) - 1; + if (is_odd) { + median_val[index * 2] = sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + output[index] = sort_out_ptr[pos]; + } else { + median_val[index * 2] = + pos > 0 ? 
sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + T median_val_left = pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T median_val_right = sort_out_ptr[pos]; + output[index] = (median_val_left + median_val_right) / div_factor; + } + } +} + +template +__global__ void CalcNanmedianKernel(const T* sort_out_ptr, + const int64_t* sort_indices_ptr, + int64_t* nan_counts, + int64_t* median_val, + T* output, + const bool is_odd, + const int64_t pre_dim, + const int64_t max_valid_num, + const int64_t stride, + const T div_factor, + const T nan_val) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t pos = static_cast(index * max_valid_num); + int64_t nan_cnt = nan_counts[index]; + if (nan_cnt == stride) { + median_val[index * 2] = -1; + median_val[index * 2 + 1] = -1; + output[index] = nan_val; + } else { + int64_t nan_k = + nan_cnt > 0 ? static_cast(stride - nan_cnt) : max_valid_num; + int64_t row_pos = static_cast(nan_k >> 1); + pos += row_pos; + + if (nan_k & 1) { + median_val[index * 2] = sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + output[index] = sort_out_ptr[pos]; + } else { + median_val[index * 2] = + pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + T median_val_left = pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T median_val_right = sort_out_ptr[pos]; + output[index] = (median_val_left + median_val_right) / div_factor; + } + } + } +} + +template +void ProcessMedianKernel(const Context& dev_ctx, + const DenseTensor& x, + bool ignore_nan, + DenseTensor* out, + int64_t* m_ptr) { + bool should_ignore_nan = ignore_nan; + auto stream = dev_ctx.stream(); + + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + int64_t i = 0; + + DenseTensor nan_counts, nan_stat; + int64_t* nan_counts_ptr; + int64_t max_valid_num = 0; + if (should_ignore_nan) { + nan_counts.Resize(phi::make_ddim({pre_dim})); + dev_ctx.template Alloc(&nan_counts); + nan_counts_ptr = nan_counts.data(); + nan_stat.Resize(phi::make_ddim({2})); + int64_t* nan_stat_mem = dev_ctx.template Alloc(&nan_stat); + int64_t* nan_stat_ptr = nan_stat.data(); + + KernelNanCounts<<>>(x_ptr, + numel, + pre_dim, + stride, + std::numeric_limits::min(), + nan_stat_ptr, + nan_counts_ptr); + + auto nan_stat_mem_cpu = + paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2); + int64_t* nan_stat_cpu_ptr = + reinterpret_cast(nan_stat_mem_cpu->ptr()); + paddle::memory::Copy(phi::CPUPlace(), + nan_stat_cpu_ptr, + dev_ctx.GetPlace(), + nan_stat_mem, + sizeof(int64_t) * 2, + stream); + + // all elements are nan values + T nan_val = std::numeric_limits::quiet_NaN(); + if (nan_stat_cpu_ptr[0] == numel) { + FullLikeKernel(dev_ctx, x, nan_val, x.dtype(), out); + return; + } + + should_ignore_nan = nan_stat_cpu_ptr[0] > 0; + max_valid_num = nan_stat_cpu_ptr[1]; + } + + int64_t sort_k = should_ignore_nan ? 
max_valid_num : ((stride >> 1) + 1); + bool is_ori_odd = stride & 1; + + DenseTensor sort_out, sort_indices; + auto sort_dim = x.dims(); + int64_t rank = sort_dim.size(); + sort_dim[rank - 1] = sort_k; + sort_out.Resize(sort_dim); + sort_indices.Resize(sort_dim); + + dev_ctx.template Alloc(&sort_out); + T* sort_out_ptr = sort_out.data(); + dev_ctx.template Alloc(&sort_indices); + int64_t* sort_indices_ptr = sort_indices.data(); + + TopkKernel( + dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); + + T div_factor = static_cast(2.0); + T nan_val = std::numeric_limits::quiet_NaN(); + if (should_ignore_nan) { + CalcNanmedianKernel< + T><<>>( + sort_out_ptr, + sort_indices_ptr, + nan_counts_ptr, + m_ptr, + o_ptr, + is_ori_odd, + pre_dim, + max_valid_num, + stride, + div_factor, + nan_val); + } else { + CalcMedianKernel< + T><<>>( + sort_out_ptr, + sort_indices_ptr, + m_ptr, + o_ptr, + div_factor, + is_ori_odd, + pre_dim, + sort_k); + } +} + +template +void BaseMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& axes, + bool ignore_nan, + DenseTensor* out, + DenseTensor* median_index) { + DenseTensor x; + auto rank = input.dims().size(); + if ((axes.size() == 0) || rank <= 1) { + x = input; + x.Resize({input.numel()}); + } else { + PreprocessMedianKernel(dev_ctx, input, axes, &x); + } + + int64_t* m_ptr = dev_ctx.template Alloc(median_index); + ProcessMedianKernel(dev_ctx, x, ignore_nan, out, m_ptr); + out->Resize(out->dims()); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keepdim, + DenseTensor* out, + DenseTensor* median_index) { + BaseMedianKernel(dev_ctx, x, axes, true, out, median_index); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian, + GPU, + ALL_LAYOUT, + phi::NanmedianKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu index 43106ec1d863f..407f33c40089c 100644 --- a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu @@ -23,7 +23,7 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& labels, - paddle::optional weight, + const paddle::optional& weight, const DenseTensor& total_weight, const DenseTensor& dout, int64_t ignore_index, diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu index 6b0e1fef7ba9a..99a8b10b11b5c 100644 --- a/paddle/phi/kernels/gpu/nll_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu @@ -24,7 +24,7 @@ template void NllLossRawKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, DenseTensor* out, diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu index 6745653eba7d1..45e4730e173fe 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -107,7 +107,7 @@ template void PsroiPoolGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, const DenseTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu index 
8f9be001ba763..f296d0d20743e 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu @@ -107,7 +107,7 @@ template void PsroiPoolKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, int pooled_height, int pooled_width, int output_channels, diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc index 76407475281da..98c2f618e7868 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc @@ -80,7 +80,7 @@ void RnnGradKernel(const Context &dev_ctx, const DenseTensor &x, const std::vector &pre_state, const std::vector &weight_list, - paddle::optional sequence_length, + const paddle::optional &sequence_length, const DenseTensor &out, const DenseTensor &dropout_state, const DenseTensor &reserve, diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index d30b7ec34d43c..5a19d5b89f0e3 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -134,7 +134,7 @@ void RnnKernel(const Context &dev_ctx, const DenseTensor &x, const std::vector &pre_state, const std::vector &weight_list, - paddle::optional sequence_length, + const paddle::optional &sequence_length, float dropout_prob, bool is_bidirec, int input_size, @@ -175,17 +175,13 @@ void RnnKernel(const Context &dev_ctx, mode)); if (!is_test) { - int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed == 0) { - // If perform `manual_seed` in python and inner seed is not specified - // (equals 0), use global generator generated seed. + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. 
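
The signature changes running through the interpolate, label_smooth, layer_norm, nll_loss, psroi_pool and rnn kernels above all make the same switch: optional tensor inputs are now taken as a const reference to the optional rather than by value. A minimal sketch of the convention, using std::optional as a stand-in for paddle::optional:

#include <optional>
#include <vector>

// std::optional used only to illustrate the parameter-passing convention.
struct Tensor {
  std::vector<float> data;
};

// old style: the optional parameter is taken by value
float NumElemsOld(std::optional<Tensor> t) {
  return t ? static_cast<float>(t->data.size()) : 0.0f;
}

// new style: the optional parameter is taken by const reference, as in this diff
float NumElemsNew(const std::optional<Tensor>& t) {
  return t ? static_cast<float>(t->data.size()) : 0.0f;
}
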
+ int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); - } else if (seed == 0) { - // use random generated seed - std::random_device rd; - seed = rd(); - } // else use `ctx.Attr("seed")` specified seed + } + // else use `ctx.Attr("seed")` specified seed } const T *x_data = x.data(); diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index cf076128b6939..9f9ea6753402b 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -172,7 +172,7 @@ template void RoiAlignGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& out_grad, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index cb3375dee95a5..fc24179ed3d26 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -139,7 +139,7 @@ template void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index d093a71d23f4e..1a5af93c562bf 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -75,7 +75,7 @@ template void RoiPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& arg_max, const DenseTensor& out_grad, int pooled_height, diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index ab33e2cf64751..32ea6223c9c2a 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -104,7 +104,7 @@ template void RoiPoolKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu new file mode 100644 index 0000000000000..44dc31ed5d926 --- /dev/null +++ b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
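
The RnnKernel hunk above drops the GetIsInitPy()/std::random_device fallback: when the op's seed attribute is 0, the seed now always comes from the per-device default CUDA generator, and the same policy is applied to the truncated_gaussian_random and uniform_random kernels later in this diff. A rough sketch of the resulting dispatch, with ResolveSeed and the std::mt19937_64 engine as stand-ins rather than Paddle APIs:

#include <cstdint>
#include <random>

// An explicit non-zero seed attribute wins; otherwise the seed is drawn from
// the (per-device) global generator, so results follow the globally set seed
// instead of falling back to std::random_device.
uint64_t ResolveSeed(int op_seed_attr, std::mt19937_64* global_gen) {
  if (op_seed_attr != 0) {
    return static_cast<uint64_t>(op_seed_attr);  // use the OP-specified seed
  }
  return (*global_gen)();  // seed generated by the global generator
}
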
+ +#include "paddle/phi/kernels/rrelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +__global__ void RReluOpGradKernel(const T* x_ptr, + const T* noise_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + int numel) { + CUDA_KERNEL_LOOP(index, numel) { + T scale = noise_ptr[index]; + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + x_grad_ptr[index] = (x < zero) ? scale * out_grad : out_grad; + } +} + +template +class RReluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* noise, + const T* out_grad, + T* x_grad, + int numel) { + RReluOpGradKernel< + T><<>>( + x, noise, out_grad, x_grad, numel); + } +}; + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + if (!x_grad) return; + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* n_ptr = noise.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + + int numel = x.numel(); + auto stream = dev_ctx.stream(); + + RReluOpGradFunctor rrelu_grad; + rrelu_grad(stream, x_ptr, n_ptr, out_grad_ptr, x_grad_ptr, numel); +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu_grad, + GPU, + ALL_LAYOUT, + phi::RReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/rrelu_kernel.cu b/paddle/phi/kernels/gpu/rrelu_kernel.cu new file mode 100644 index 0000000000000..39582d5872a70 --- /dev/null +++ b/paddle/phi/kernels/gpu/rrelu_kernel.cu @@ -0,0 +1,112 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
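
For reference, the RReLU math implemented by the backward kernel above and the forward kernel that follows, written as a plain CPU sketch (RReluForward/RReluBackward are illustrative names, and the training path draws the slope inline instead of via distribution_and_transform):

#include <random>

// Training samples the negative slope per element from U(lower, upper) and
// records it in *noise; inference uses the fixed midpoint (lower + upper) / 2;
// the backward pass scales the incoming gradient by the recorded slope only
// where x < 0.
float RReluForward(float x, float lower, float upper, bool is_test,
                   std::mt19937* rng, float* noise) {
  const float slope =
      is_test ? (lower + upper) / 2.0f
              : std::uniform_real_distribution<float>(lower, upper)(*rng);
  if (x >= 0.0f) {
    *noise = 1.0f;
    return x;
  }
  *noise = slope;
  return slope * x;
}

float RReluBackward(float x, float noise, float out_grad) {
  return x < 0.0f ? noise * out_grad : out_grad;
}
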
*/ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/rrelu_kernel.h" + +namespace phi { + +template +struct RReluTrainCudaFunctor { + public: + RReluTrainCudaFunctor(const T* in, T* out, T* noise) + : in_(in), out_(out), noise_(noise) { + zero_ = static_cast(0); + } + + __device__ void operator()(int64_t idx) { + T x = in_[idx]; + if (x < zero_) { + out_[idx] = noise_[idx] * x; + } else { + out_[idx] = x; + noise_[idx] = 1.0; + } + } + + private: + const T* in_; + T* out_; + T* noise_; + T zero_; +}; + +template +struct RReluTestCudaFunctor { + public: + RReluTestCudaFunctor(const T* in, T* out, T* noise, T mid_val) + : in_(in), out_(out), noise_(noise), mid_val_(mid_val) { + zero_ = static_cast(0); + } + + __device__ void operator()(int64_t idx) { + T x = in_[idx]; + if (x < zero_) { + out_[idx] = mid_val_ * x; + noise_[idx] = mid_val_; + } else { + out_[idx] = x; + noise_[idx] = 1.0; + } + } + + private: + const T* in_; + T* out_; + T* noise_; + T zero_; + T mid_val_; +}; + +template +void RReluKernel(const Context& ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise) { + const T* x_data = x.data(); + T* out_data = ctx.template Alloc(out); + T* noise_data = ctx.template Alloc(noise); + auto size = x.numel(); + if (size <= 0) return; + + phi::funcs::ForRange for_range(ctx, size); + if (is_test) { + T mid_val = static_cast((lower + upper) / 2.0); + RReluTestCudaFunctor functor(x_data, out_data, noise_data, mid_val); + for_range(functor); + } else { + using MT = typename kps::details::MPTypeTrait::Type; + funcs::uniform_distribution dist; + funcs::uniform_real_transform trans(lower, upper); + funcs::distribution_and_transform(ctx, noise, dist, trans); + RReluTrainCudaFunctor functor(x_data, out_data, noise_data); + for_range(functor); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu, + GPU, + ALL_LAYOUT, + phi::RReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index 7dd5a03383fd2..d71112a2f2884 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -69,7 +69,7 @@ void SGDDenseKernel(const Context& dev_ctx, const DenseTensor& param, const DenseTensor& learning_rate, const DenseTensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -106,7 +106,7 @@ void SGDDenseParamSparseGradKernel( const DenseTensor& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -175,7 +175,7 @@ void SGDSparseParamSparseGradKernel( const SelectedRows& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, SelectedRows* param_out, SelectedRows* master_param_out) { diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index 0c3c29e82c42a..990877a8445cb 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(tile, double, int, 
int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index 5b6ae9d09bff2..33ecb4d6eb544 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -90,34 +90,25 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, int seed, DataType dtype, DenseTensor* out) { - auto tensor = out; - - T* data = dev_ctx.template Alloc(tensor); - - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } + T* data = dev_ctx.template Alloc(out); thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); + int64_t size = out->numel(); auto gen_cuda = dev_ctx.GetGenerator(); - - if (gen_cuda->GetIsInitPy() && seed_flag) { + if (seed == 0) { + // use global Generator seed auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform(index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(data), - TruncatedNormalOffset(mean, - std, - std::numeric_limits::min(), - seed_offset.first, - gen_offset)); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + thrust::transform( + index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormalOffset( + mean, std, std::numeric_limits::min(), seed, size * offset)); } else { + // use OP seed thrust::transform( index_sequence_begin, index_sequence_begin + size, diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index a4aea10cfe762..68e61b7328971 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -65,22 +65,15 @@ void UniformRandomRawKernel(const Context& dev_ctx, float diag_val, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); - T* data = dev_ctx.template Alloc(out); - auto size = out->numel(); - bool seed_flag = false; + dev_ctx.template Alloc(out); if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - auto generator = dev_ctx.GetGenerator(); - if (generator->GetIsInitPy() && seed_flag) { + // Use global Generator seed using MT = typename kps::details::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::uniform_real_transform trans(min, max); funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { + // Use OP seed auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); IndexKernel>(dev_ctx, out, func); diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index 58c7ea69869b3..53e4c39d8bcee 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -47,8 +47,8 @@ void ConvCudnnGradGradKernel( const DenseTensor& input, const DenseTensor& filter, const DenseTensor& out_grad, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -670,8 +670,8 @@ void ConvCudnnGradGradKernel( template void DepthwiseConvCudnnGradGradKernel( const Context& ctx, - paddle::optional input_grad_grad, - 
paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, @@ -711,11 +711,11 @@ void DepthwiseConvCudnnGradGradKernel( template void Conv3DCudnnGradGradKernel( const Context& ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -725,9 +725,9 @@ void Conv3DCudnnGradGradKernel( bool use_addto, int workspace_size_MB, bool exhaustive_search_t, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { ConvCudnnGradGradKernel(ctx, input, filter, diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 77159bfc876da..58781e8c6e491 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -888,19 +888,6 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx, #endif } -template -static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) { - if (dev_ctx.cudnn_handle() != nullptr) { - if (std::is_same::value) { -#if CUDNN_VERSION < 8100 - return false; -#endif - } - return true; - } - return false; -} - #if CUDNN_VERSION < 8100 template <> inline void SoftmaxForwardCudnnKernel( @@ -927,6 +914,25 @@ inline void SoftmaxBackwardCudnnKernel( } #endif +template +bool UseCudnnSoftmax(const GPUContext& ctx, int softmax_dim, bool last_dim) { + bool cudnn_available = ctx.cudnn_handle(); + if (!ctx.cudnn_handle()) { + if (std::is_same::value) { +#if CUDNN_VERSION < 8100 + cudnn_available = false; +#endif + } + } + constexpr int max_dim = 512; + if (!cudnn_available || !last_dim || + (softmax_dim <= max_dim && sizeof(T) <= 4)) { + return false; + } else { + return true; + } +} + template void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, const DenseTensor& x, @@ -941,10 +947,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, int dim = tensor_dims[1]; int D = tensor_dims[2]; - constexpr int max_dim = 512; - - if (D == 1 && - (!CanUseCudnnSoftmax(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) { + if (D == 1 && !UseCudnnSoftmax(dev_ctx, dim, true)) { int dim_log2 = static_cast(Log2Ceil(dim)); int dim_ceil = 1 << dim_log2; int warp_size = (dim_ceil < 32) ? dim_ceil : 32; @@ -1016,10 +1019,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx, int dim = tensor_dims[1]; int D = tensor_dims[2]; - constexpr int max_dim = 512; - - if (D == 1 && - (!CanUseCudnnSoftmax(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) { + if (D == 1 && !UseCudnnSoftmax(dev_ctx, dim, true)) { int dim_log2 = Log2Ceil(dim); int dim_ceil = 1 << dim_log2; int warp_size = (dim_ceil < 32) ? 
dim_ceil : 32; diff --git a/paddle/phi/kernels/graph_reindex_kernel.h b/paddle/phi/kernels/graph_reindex_kernel.h index 68f1ebc6f5cc4..12a742006ee73 100644 --- a/paddle/phi/kernels/graph_reindex_kernel.h +++ b/paddle/phi/kernels/graph_reindex_kernel.h @@ -23,8 +23,8 @@ void GraphReindexKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& neighbors, const DenseTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const paddle::optional& hashtable_value, + const paddle::optional& hashtable_index, bool flag_buffer_hashtable, DenseTensor* reindex_src, DenseTensor* reindex_dst, diff --git a/paddle/phi/kernels/graph_sample_neighbors_kernel.h b/paddle/phi/kernels/graph_sample_neighbors_kernel.h index f7d205bd08ad0..065c7f141225d 100644 --- a/paddle/phi/kernels/graph_sample_neighbors_kernel.h +++ b/paddle/phi/kernels/graph_sample_neighbors_kernel.h @@ -24,8 +24,8 @@ void GraphSampleNeighborsKernel( const DenseTensor& row, const DenseTensor& col_ptr, const DenseTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, + const paddle::optional& eids, + const paddle::optional& perm_buffer, int sample_size, bool return_eids, bool flag_perm_buffer, diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h index c0b1a34d09c00..fbb6db358a476 100644 --- a/paddle/phi/kernels/graph_send_recv_grad_kernel.h +++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h @@ -25,8 +25,8 @@ void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - paddle::optional out, - paddle::optional dst_count, + const paddle::optional& out, + const paddle::optional& dst_count, const DenseTensor& out_grad, const std::string& pool_type, DenseTensor* x_grad); diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h index 7922a767db23c..c0da8faadd592 100644 --- a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h +++ b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h @@ -23,9 +23,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h index 619b022904b17..e32306b645a6f 100644 --- a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h +++ b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h @@ -23,9 +23,9 @@ void HierarchicalSigmoidKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, int num_classes, bool remote_prefetch, int trainer_id, diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 04391d2538c89..80dba29e76cbd 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -265,7 +265,7 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& d_dout_new, - 
paddle::optional d_ddout, + const paddle::optional& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx) { diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index d5efd22a31daa..9956f07bf0b98 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -44,6 +44,10 @@ void AddmmGradKernel(const Context& dev_ctx, DenseTensor* x_grad, DenseTensor* y_grad) { auto in_dims = input.dims(); + if (input.dims().size() == 1) { + in_dims = {1, input.dims()[0]}; + input_grad->Resize(in_dims); + } int total_elems = 0; VLOG(3) << "alpha: " << alpha << " beta: " << beta; @@ -85,6 +89,10 @@ void AddmmGradKernel(const Context& dev_ctx, } blas.SCAL(total_elems, beta, input_grad->data()); + + if (input.dims().size() == 1) { + input_grad->Resize(input.dims()); + } } if (x_grad) { dev_ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h index f7afdfd622e63..3286e31f68923 100644 --- a/paddle/phi/kernels/impl/addmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -44,6 +44,12 @@ void AddmmKernel(const Context& dev_ctx, auto x_dims = x.dims(); auto y_dims = y.dims(); + DenseTensor input_2d(input); + if (input.dims().size() == 1) { + input_dims = {1, input.dims()[0]}; + input_2d.Resize(input_dims); + } + // broadcast mode check if (x_dims[0] != input_dims[0]) { PADDLE_ENFORCE_EQ(input_dims[0], @@ -97,7 +103,8 @@ void AddmmKernel(const Context& dev_ctx, bcast_dims[1] = y_dims[1] / input_dims[1]; VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; // broadcast using eigen - auto eigen_input = PhiEigenTensor::From(input); + const DenseTensor& const_ref_input = input_2d; + auto eigen_input = PhiEigenTensor::From(const_ref_input); auto eigen_out = PhiEigenTensor::From(*out); auto& place = *dev_ctx.eigen_device(); funcs::EigenBroadcast, T, 2>::Eval( diff --git a/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h index 3f30a4b958ebe..4a2e41532e9ff 100644 --- a/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h +++ b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h @@ -26,7 +26,7 @@ void BilinearTensorProductKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& weight, - paddle::optional bias, + const paddle::optional& bias, DenseTensor* out) { ctx.template Alloc(out); diff --git a/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h index 26bee763eca52..b8406b9143103 100644 --- a/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h @@ -32,8 +32,8 @@ void ChannelShuffleGradKernel(const Context& dev_ctx, auto* dx = x_grad; dev_ctx.template Alloc(dx); bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h index c723cd3622af9..7e31e02851591 100644 --- a/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h @@ -31,8 +31,8 @@ void ChannelShuffleKernel(const 
Context& dev_ctx, auto* in = &x; dev_ctx.template Alloc(out); bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h index 64306bc827e4b..512b1529f9191 100644 --- a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h @@ -29,8 +29,8 @@ void ConvGradGradKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& filter, const DenseTensor& out_grad, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index 8d8e66a02f5fb..744c48b2bfbd6 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -163,7 +163,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index 6c0457024ddc4..f864c2e5f0ed0 100644 --- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -28,7 +28,7 @@ void DeformableConvKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index 2b087f8dcae09..a72db326807f8 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include "paddle/fluid/platform/profiler.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/impl/einsum_impl.h" #include "paddle/phi/kernels/tile_kernel.h" @@ -55,7 +56,13 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, } t.Resize(make_ddim(resize_dims)); DenseTensor after_tile; - TileKernel(dev_ctx, t, repeat_times, &after_tile); + if (std::all_of(repeat_times.begin(), repeat_times.end(), [](int x) { + return x == 1; + })) { + after_tile = t; + } else { + TileKernel(dev_ctx, t, repeat_times, &after_tile); + } size_t n_ellipsis_idx = op_label.find(".", 0); if (n_ellipsis_idx != std::string::npos) { // may be we need reduce. broadcast_dims is not equal to ellipsis dims. 
@@ -91,10 +98,11 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, template void EinsumGradKernel(const Context& dev_ctx, const std::vector& x, + const std::vector& inner_cache, const DenseTensor& out_grad, const std::string& equation, std::vector x_grad) { - VLOG(5) << "Start EisumGradKernel:"; + VLOG(5) << "Start EinsumGradKernel:"; LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(x.size(), LabelMap(-1)); @@ -162,22 +170,51 @@ void EinsumGradKernel(const Context& dev_ctx, operands_for_B.push_back(x[0]); DenseTensor before_tile; - EinsumKernel(dev_ctx, operands_for_A, equation_for_A, &dA); - EinsumKernel(dev_ctx, operands_for_B, equation_for_B, &dB); - *(x_grad[0]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[0], - ops[0], - dA); - *(x_grad[1]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[1], - ops[1], - dB); + + std::vector cache(3); // set empty; TA, TB, TdC + if (inner_cache.size() > + 0) { // for compatibility, we can load and run v2.3 EinsumOp. + cache[0].ShareBufferWith(*(inner_cache[0])); + cache[1].ShareBufferWith(*(inner_cache[1])); + } + + EinsumKernelImpl(dev_ctx, + all_labels, + operands_for_A, + equation_for_A, + &dA, + {&cache[1], &cache[2]}, + false); + + EinsumKernelImpl(dev_ctx, + all_labels, + operands_for_B, + equation_for_B, + &dB, + {&cache[2], &cache[0]}, + false); + + // release the cache tensor dTC to save memory right now. they are useless + // now. + cache.clear(); + if (x_grad[0]) { + *(x_grad[0]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[0], + ops[0], + dA); + } + if (x_grad[1]) { + *(x_grad[1]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[1], + ops[1], + dB); + } } } } // namespace phi diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 901147734b29f..bfbd6e0c51cfc 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -20,6 +20,8 @@ #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/utils/string/string_helper.h" +DECLARE_bool(einsum_opt); + namespace phi { // check the validation of the Einsum equation. 
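Note (illustrative worked example, not part of the patch): for O = einsum("ij,jk->ik", A, B), the gradients are themselves einsums with one operand replaced by dO, namely dA = einsum("ik,jk->ij", dO, B) because dA_ij = sum_k dO_ik * B_jk, and dB = einsum("ij,ik->jk", A, dO) because dB_jk = sum_i A_ij * dO_ik. Since these backward contractions reduce and transpose the same operands as the forward pass, the InnerCache tensors produced by the forward kernel can be handed to the backward kernel through ShareBufferWith instead of being recomputed; the inner_cache.size() check above exists only so programs saved by the v2.3 EinsumOp, which had no InnerCache output, keep running.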
@@ -137,7 +139,6 @@ inline std::vector TransformLabelsOrder( std::vector tmp; for (int c : all_labels) { if (type[c] == cnt_type) tmp.push_back(c); - std::sort(tmp.begin(), tmp.end()); } ret.insert(ret.end(), tmp.begin(), tmp.end()); } @@ -176,6 +177,15 @@ inline static void GlobalInfo(const std::vector& op_labels, (*label2type)['.'] = LabelType::Batch; + if (sorted_labels->size()) { + std::set exist(all.begin(), all.end()); + all.clear(); + std::for_each( + sorted_labels->begin(), sorted_labels->end(), [&exist, &all](char c) { + if (exist.count(c)) all.push_back(c); + }); + } + *sorted_labels = TransformLabelsOrder(all, *label2type, {LabelType::Batch, @@ -409,7 +419,8 @@ DenseTensor PerformContraction( const LabelMap& label2shape, const std::vector>& ellipsis_dims, const std::vector& broadcast_dims, - std::vector cache) { + std::vector cache, + bool use_cache) { // Get All the Batches, so perm is auto all_valid = LabelMap(1); auto recover_dim = GetShapeByType(all_labels, @@ -447,14 +458,17 @@ DenseTensor PerformContraction( } // reduction DenseTensor trans_t; - if (cache[operand_idx]->IsInitialized()) { + if (FLAGS_einsum_opt && use_cache && cache[operand_idx] != nullptr && + cache[operand_idx]->IsInitialized()) { trans_t.ShareBufferWith(*(cache[operand_idx])); + VLOG(5) << "Cache Used!"; } else { auto reduct_t = PerformReduction( dev_ctx, t, perm, all_labels, ellipsis, label2type); trans_t = PerformTranspose( dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); - cache[operand_idx]->ShareBufferWith(trans_t); + if (FLAGS_einsum_opt && cache[operand_idx] != nullptr) + cache[operand_idx]->ShareBufferWith(trans_t); } auto mul_dims = GetShapeByType(all_labels, label2type, @@ -515,18 +529,23 @@ void TransposeToOutput(const Context& dev_ctx, axis.push_back(it - all_labels.begin() + offset); } } - if (is_no_need_transpose(axis)) return output->ShareBufferWith(to_trans); + if (is_no_need_transpose(axis)) { + output->ShareBufferWith(to_trans); + return; + } VLOG(5) << "call TransposeToOutput: with axis: " << paddle::string::join_strings(axis, ","); - return TransposeKernel(dev_ctx, to_trans, axis, output); + TransposeKernel(dev_ctx, to_trans, axis, output); } template void EinsumKernelImpl(const Context& dev_ctx, + const std::vector& forward_all_labels, const std::vector& inputs, const std::string& equation, DenseTensor* out, - std::vector cache) { + std::vector cache, + bool is_forward = true) { ValidationCheck(equation); // collect the following informations to prepare einsum. LabelMap labelshape(0); @@ -542,6 +561,9 @@ void EinsumKernelImpl(const Context& dev_ctx, input_dims.push_back(i->dims()); } std::string right; + if (!is_forward) { + all_labels = forward_all_labels; + } ParseEinsumEquation(equation, input_dims, &labelshape, @@ -557,7 +579,6 @@ void EinsumKernelImpl(const Context& dev_ctx, auto& A = inputs[0]; auto& B = inputs[1]; // Reduction and Contract Procedure - dev_ctx.template Alloc(out); auto after_contraction = PerformContraction(dev_ctx, *A, *B, @@ -567,7 +588,8 @@ void EinsumKernelImpl(const Context& dev_ctx, labelshape, ellipsis_dims, broadcast_dims, - cache); + cache, + !is_forward); TransposeToOutput(dev_ctx, after_contraction, right, @@ -599,18 +621,37 @@ void EinsumKernelImpl(const Context& dev_ctx, } } +template +void EinsumKernelRaw(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache) { + std::vector tmp; + // for the sake of compatibility, we may load and run v2.3 EinsumOp. 
Output + // may have nullptr and the cache.size() is not equal to inputs.size(). refer + // to BuildPhiKernelContext for details. + int diff = inputs.size() - cache.size(); + for (int i = 0; i < diff; ++i) { + cache.push_back(nullptr); + } + EinsumKernelImpl( + dev_ctx, tmp, inputs, equation, out, cache, /*forward=*/true); +} + template void EinsumKernel(const Context& dev_ctx, const std::vector& inputs, const std::string& equation, DenseTensor* out) { - std::vector cache(inputs.size()); // set empty; TA, TB, TdC + std::vector place_holder; std::vector cache_tensor( inputs.size()); // set empty; TA, TB, TdC for (size_t i = 0; i < inputs.size(); ++i) { - cache_tensor[i] = &cache[i]; + cache_tensor[i] = nullptr; } - EinsumKernelImpl(dev_ctx, inputs, equation, out, cache_tensor); + EinsumKernelImpl( + dev_ctx, place_holder, inputs, equation, out, cache_tensor, true); } } // namespace phi diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 3c06b238d145c..73935640e349b 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -53,8 +53,8 @@ void AddGradImpl(const Context& dev_ctx, template void AddDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, - const paddle::optional& ddx, - const paddle::optional& ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& dout, int axis, DenseTensor* ddout) { @@ -87,8 +87,8 @@ void AddDoubleGradImpl(const Context& dev_ctx, template void SubtractDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, - const paddle::optional& ddx, - const paddle::optional& ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& dout, int axis, DenseTensor* ddout) { @@ -160,8 +160,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, const DenseTensor& dx, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* dy, DenseTensor* dout, @@ -416,8 +416,8 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* dx, DenseTensor* dy, @@ -535,11 +535,11 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& d_dx, const DenseTensor& d_dy, - paddle::optional d_ddout, + const paddle::optional& d_ddout, int axis, DenseTensor* d_x, DenseTensor* d_y, diff --git a/paddle/phi/kernels/impl/expand_as_kernel_impl.h b/paddle/phi/kernels/impl/expand_as_kernel_impl.h index e5138e4e12c05..a5661aaa2ac16 100644 --- a/paddle/phi/kernels/impl/expand_as_kernel_impl.h +++ b/paddle/phi/kernels/impl/expand_as_kernel_impl.h @@ -93,7 +93,7 @@ void ExpandAs(const Context& context, template void ExpandAsKernel(const Context& ctx, const DenseTensor& x, - paddle::optional y, + const paddle::optional& y, const std::vector& target_shape, DenseTensor* out) { auto rank = x.dims().size(); diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index 25a9db868d357..5641e7a8274f3 100644 --- 
a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -473,8 +473,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, bool transpose_x, bool transpose_y, DenseTensor* dx, @@ -854,9 +854,9 @@ void MatmulTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& ddy, - paddle::optional d_dx, - paddle::optional d_dy, - paddle::optional d_ddout, + const paddle::optional& d_dx, + const paddle::optional& d_dy, + const paddle::optional& d_ddout, bool transpose_x, bool transpose_y, DenseTensor* out_d_x, @@ -1790,8 +1790,8 @@ void MatmulWithFlattenDoubleGradKernel( const DenseTensor& x, const DenseTensor& y, const DenseTensor& out_grad, - paddle::optional x_grad_grad, - paddle::optional y_grad_grad, + const paddle::optional& x_grad_grad, + const paddle::optional& y_grad_grad, int x_num_col_dims, int y_num_col_dims, DenseTensor* x_grad, diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h index 3aca225ad403b..825a3b9d56990 100644 --- a/paddle/phi/kernels/impl/momentum_kernel_impl.h +++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h @@ -408,7 +408,7 @@ void MomentumDenseImpl(const Context& ctx, const DenseTensor& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param_opt, + const paddle::optional& master_param_opt, float mu_t, bool use_nesterov, const std::string& regularization_method, @@ -500,7 +500,7 @@ void MomentumSparseImpl(const Context& ctx, const SelectedRows& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param_opt, + const paddle::optional& master_param_opt, float mu_t, bool use_nesterov, const std::string& regularization_method, @@ -602,7 +602,7 @@ void MomentumDenseKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -654,7 +654,7 @@ void MomentumSparseKernel(const Context& dev_ctx, const SelectedRows& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, diff --git a/paddle/phi/kernels/impl/mv_kernel_impl.h b/paddle/phi/kernels/impl/mv_kernel_impl.h index 1754ea323ceb9..4baee25a0993a 100644 --- a/paddle/phi/kernels/impl/mv_kernel_impl.h +++ b/paddle/phi/kernels/impl/mv_kernel_impl.h @@ -23,7 +23,7 @@ void MvKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& vec, DenseTensor* out) { - auto dim_x = x.dims(); + const auto& dim_x = x.dims(); // get data ptr const T* x_data = x.data(); diff --git a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h index db19a04337932..f71f6cd990aa1 100644 --- a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h @@ -32,8 +32,8 @@ void PixelShuffleGradKernel(const Context& ctx, ctx.template Alloc(dx); int factor = upscale_factor; bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - 
auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h index 2303db4ea57d6..c5e41b4902951 100644 --- a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h @@ -31,8 +31,8 @@ void PixelShuffleKernel(const Context& ctx, ctx.template Alloc(out); int factor = upscale_factor; bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h index cb02539f2e890..399c6a56727e2 100644 --- a/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h @@ -33,8 +33,8 @@ void PixelUnshuffleGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(dx); int factor = downscale_factor; bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h index 0a140b270ba1b..7ffce62dacf65 100644 --- a/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h @@ -32,8 +32,8 @@ void PixelUnshuffleKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); int factor = downscale_factor; bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h index 64b12837074dd..1954c5f20db3e 100644 --- a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h +++ b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h @@ -146,7 +146,7 @@ void RmspropDenseKernel(const Context &ctx, const DenseTensor &grad, const DenseTensor &moment, const DenseTensor &learning_rate, - paddle::optional mean_grad_opt, + const paddle::optional &mean_grad_opt, float epsilon_t, float decay_t, float momentum_t, @@ -196,11 +196,19 @@ void RmspropDenseKernel(const Context &ctx, if (centered) { auto mg_tensor = mean_grad_opt.get_ptr(); auto mg = EigenVector::Flatten(*mg_tensor); - PADDLE_ENFORCE_EQ( - mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } auto mg_out = EigenVector::Flatten(*mean_grad_out); mg_out.device(place) = rho * mg + (1 - rho) * g; @@ -217,12 +225,20 @@ void RmspropDenseKernel(const Context &ctx, funcs::ForRange for_range(ctx, limit); if (centered) { auto mg_tensor = mean_grad_opt.get_ptr(); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + 
mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } - PADDLE_ENFORCE_EQ( - mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( ctx.template Alloc(param_out), ctx.template Alloc(mean_square_out), @@ -254,7 +270,7 @@ void RmspropSparseKernel(const Context &ctx, const SelectedRows &grad, const DenseTensor &moment, const DenseTensor &learning_rate, - paddle::optional mean_grad_opt, + const paddle::optional &mean_grad_opt, float epsilon_t, float decay_t, float momentum_t, @@ -305,11 +321,20 @@ void RmspropSparseKernel(const Context &ctx, if (centered) { auto mg_tensor = mean_grad_opt.get_ptr(); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } - PADDLE_ENFORCE_EQ(mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( ctx.template Alloc(param_out), ctx.template Alloc(mean_square_out), diff --git a/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h index 4ba1a0c6b6c0f..bd0ba26b99a43 100644 --- a/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h @@ -27,7 +27,7 @@ void SegmentPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& segment_ids, const DenseTensor& out, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const DenseTensor& out_grad, const std::string& pooltype, DenseTensor* x_grad) { diff --git a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h index 5556654ee7c0d..0724cffdd4448 100644 --- a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h @@ -35,7 +35,7 @@ void UnfoldGradKernel(const Context& ctx, if (!x_grad) return; - auto x_dims = x_grad->dims(); + const auto& x_dims = x_grad->dims(); const int batch_size = static_cast(x_dims[0]); int out_height = phi::funcs::CalcOutputSize(x_dims[2], diff --git a/paddle/phi/kernels/impl/unfold_kernel_impl.h b/paddle/phi/kernels/impl/unfold_kernel_impl.h index e914f6cacbde9..4526d1c3dcd7d 100644 --- a/paddle/phi/kernels/impl/unfold_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_kernel_impl.h @@ -36,7 +36,7 @@ void UnfoldKernel(const Context& ctx, paddle::operators::math:: Im2ColFunctor im2col; - auto x_dims = x.dims(); + const auto& x_dims = x.dims(); int out_height = phi::funcs::CalcOutputSize(x_dims[2], kernel_sizes[0], diff --git a/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h index b788c966a1af1..b07628c981476 100644 --- a/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h @@ -32,7 +32,7 @@ void WarpctcGradKernel(const Context& dev_ctx, const DenseTensor& warpctc_grad, const DenseTensor& logits, const DenseTensor& loss_grad, - const paddle::optional logits_length, + const 
paddle::optional& logits_length, int blank, bool norm_by_times, DenseTensor* logits_grad) { diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h index ef6be7a9dfa88..6c792507c6f77 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -229,8 +229,8 @@ template void WarpctcKernel(const Context& dev_ctx, const DenseTensor& logits, const DenseTensor& label, - const paddle::optional logits_length, - const paddle::optional labels_length, + const paddle::optional& logits_length, + const paddle::optional& labels_length, int blank, bool norm_by_times, DenseTensor* warpctc_grad, diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h new file mode 100644 index 0000000000000..be7e4ce3e3488 --- /dev/null +++ b/paddle/phi/kernels/instance_norm_grad_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void InstanceNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y_grad, + const paddle::optional& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + float epsilon, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void InstanceNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& dy, + const paddle::optional& ddx, + const paddle::optional& ddscale, + const paddle::optional& ddbias, + float epsilon, + DenseTensor* dx, + DenseTensor* dscale, + DenseTensor* ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/instance_norm_kernel.h b/paddle/phi/kernels/instance_norm_kernel.h new file mode 100644 index 0000000000000..f8f1bbe1287a2 --- /dev/null +++ b/paddle/phi/kernels/instance_norm_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
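Note: the recurring signature change in this patch replaces pass-by-value paddle::optional parameters with const references, and such inputs are consumed the same way everywhere. A minimal sketch follows (illustrative only: ExampleKernel is not a real Paddle kernel, and it assumes paddle::optional's boost-style interface from "paddle/utils/optional.h" with get_ptr(), the accessor already used by the RMSprop kernels earlier in this patch).

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/optional.h"

namespace phi {

template <typename T, typename Context>
void ExampleKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   // optional input, passed by const reference as this patch
                   // standardizes, so no copy of the wrapper is made per call
                   const paddle::optional<DenseTensor>& bias,
                   DenseTensor* out) {
  dev_ctx.template Alloc<T>(out);
  // get_ptr() returns nullptr when the optional input was not provided
  const DenseTensor* bias_ptr = bias.get_ptr();
  if (bias_ptr != nullptr) {
    // ... fold *bias_ptr into the result ...
  }
}

}  // namespace phi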
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void InstanceNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + float epsilon, + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance); + +} // namespace phi diff --git a/paddle/phi/kernels/interpolate_grad_kernel.h b/paddle/phi/kernels/interpolate_grad_kernel.h index 59d2dddd87007..b8eefad61a768 100644 --- a/paddle/phi/kernels/interpolate_grad_kernel.h +++ b/paddle/phi/kernels/interpolate_grad_kernel.h @@ -22,9 +22,9 @@ template void BilinearInterpGradKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, diff --git a/paddle/phi/kernels/interpolate_kernel.h b/paddle/phi/kernels/interpolate_kernel.h index 4623657f5a594..c531461c12e29 100644 --- a/paddle/phi/kernels/interpolate_kernel.h +++ b/paddle/phi/kernels/interpolate_kernel.h @@ -22,9 +22,9 @@ template void BilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -39,10 +39,9 @@ template void NearestInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -57,10 +56,9 @@ template void TrilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -75,10 +73,9 @@ template void LinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -93,10 +90,9 @@ template void BicubicInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu index 8f7d45771d9d0..98e39ada32b8b 100644 --- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -33,6 +33,14 @@ void AddKernel(const Context& dev_ctx, AddRawKernel(dev_ctx, x, y, axis, out); } +template +void GradAddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + AddKernel(dev_ctx, x, y, out); +} + } // 
namespace phi #ifdef PADDLE_WITH_XPU_KP @@ -71,4 +79,18 @@ PD_REGISTER_KERNEL(add, phi::dtype::bfloat16, complex64, complex128) {} + +PD_REGISTER_KERNEL(grad_add, + KPS, + ALL_LAYOUT, + phi::GradAddKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} #endif diff --git a/paddle/phi/kernels/label_smooth_kernel.h b/paddle/phi/kernels/label_smooth_kernel.h index b7e1f2708894c..2db35e1bff346 100644 --- a/paddle/phi/kernels/label_smooth_kernel.h +++ b/paddle/phi/kernels/label_smooth_kernel.h @@ -23,7 +23,7 @@ namespace phi { template void LabelSmoothKernel(const Context& ctx, const DenseTensor& label, - paddle::optional prior_dist, + const paddle::optional& prior_dist, float epsilon, DenseTensor* out); diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h index 65f19a11b94d6..7d7cd13109be1 100644 --- a/paddle/phi/kernels/layer_norm_grad_kernel.h +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -21,8 +21,8 @@ namespace phi { template void LayerNormGradKernel(const Context& ctx, const DenseTensor& x, - paddle::optional scale, - paddle::optional bias, + const paddle::optional& scale, + const paddle::optional& bias, const DenseTensor& mean, const DenseTensor& variance, const DenseTensor& out_grad, diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h index c9679420bda5c..26c04b61af96b 100644 --- a/paddle/phi/kernels/layer_norm_kernel.h +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -22,8 +22,8 @@ namespace phi { template void LayerNormKernel(const Context& ctx, const DenseTensor& x, - paddle::optional scale, - paddle::optional bias, + const paddle::optional& scale, + const paddle::optional& bias, float epsilon, int begin_norm_axis, bool is_test, diff --git a/paddle/phi/kernels/matmul_grad_kernel.h b/paddle/phi/kernels/matmul_grad_kernel.h index 41a835db46f71..47c6acdcb3923 100644 --- a/paddle/phi/kernels/matmul_grad_kernel.h +++ b/paddle/phi/kernels/matmul_grad_kernel.h @@ -34,8 +34,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, bool transpose_x, bool transpose_y, DenseTensor* dx, @@ -49,9 +49,9 @@ void MatmulTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& ddy, - paddle::optional d_dx, - paddle::optional d_dy, - paddle::optional d_ddout, + const paddle::optional& d_dx, + const paddle::optional& d_dy, + const paddle::optional& d_ddout, bool transpose_x, bool transpose_y, DenseTensor* out_d_x, @@ -76,8 +76,8 @@ void MatmulWithFlattenDoubleGradKernel( const DenseTensor& x, const DenseTensor& y, const DenseTensor& out_grad, - paddle::optional x_grad_grad, - paddle::optional y_grad_grad, + const paddle::optional& x_grad_grad, + const paddle::optional& y_grad_grad, int x_num_col_dims, int y_num_col_dims, DenseTensor* x_grad, diff --git a/paddle/phi/kernels/momentum_kernel.h b/paddle/phi/kernels/momentum_kernel.h index b4ba449aaf3a5..172b345af163c 100644 --- a/paddle/phi/kernels/momentum_kernel.h +++ b/paddle/phi/kernels/momentum_kernel.h @@ -25,7 +25,7 @@ void MomentumDenseKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, 
const std::string& regularization_method, @@ -42,7 +42,7 @@ void MomentumSparseKernel(const Context& dev_ctx, const SelectedRows& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, diff --git a/paddle/phi/kernels/nanmedian_grad_kernel.h b/paddle/phi/kernels/nanmedian_grad_kernel.h new file mode 100644 index 0000000000000..dc7321c1aa751 --- /dev/null +++ b/paddle/phi/kernels/nanmedian_grad_kernel.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PostprocessMedianGradKernel(const Context& dev_ctx, + DenseTensor* input, + const IntArray& raw_axes, + DenseTensor* x) { + auto input_dim = input->dims(); + auto rank = input_dim.size(); + + std::vector axes = raw_axes.GetData(); + int64_t axes_size = static_cast(axes.size()); + for (int64_t i = 0; i < axes_size; i++) { + if (axes[i] < 0) { + axes[i] += rank; + } + } + + std::vector trans_back; + std::vector reshape_back; + trans_back.reserve(rank); + trans_back.resize(rank); + + int offset = 0; + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) == axes.end()) { + reshape_back.push_back(input_dim[i]); + trans_back[i] = offset; + offset += 1; + } + } + + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) != axes.end()) { + trans_back[i] = offset; + reshape_back.push_back(input_dim[i]); + offset += 1; + } + } + + input->Resize(make_ddim(reshape_back)); + funcs::TransCompute( + static_cast(trans_back.size()), dev_ctx, *input, x, trans_back); +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/nanmedian_kernel.h b/paddle/phi/kernels/nanmedian_kernel.h new file mode 100644 index 0000000000000..374f420381bdc --- /dev/null +++ b/paddle/phi/kernels/nanmedian_kernel.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
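Note (shape bookkeeping example derived from the helpers above and the PreprocessMedianKernel in nanmedian_kernel.h just below, not part of the patch): the nanmedian kernels reduce over `axes` by moving those axes to the end and flattening them into one trailing dimension, and the grad-side helper above undoes that move. With x of shape [2, 3, 4] and axes = {1}, the preprocess builds perm = {0, 2, 1} and reshape = [2, 4, 3] (post_numel = 3), so the median is taken along the last axis; in the backward pass, reshape_back = [2, 4, 3] and trans_back = {0, 2, 1}, which transposes the gradient back to the original [2, 3, 4] layout.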
+ +#pragma once +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PreprocessMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& raw_axes, + DenseTensor* x) { + auto input_dim = input.dims(); + auto rank = input_dim.size(); + std::vector perm; + std::vector reshape; + + std::vector axes = raw_axes.GetData(); + int64_t axes_size = static_cast(axes.size()); + for (int64_t i = 0; i < axes_size; i++) { + if (axes[i] < 0) { + axes[i] += rank; + } + } + + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) == axes.end()) { + perm.push_back(i); + reshape.push_back(input_dim[i]); + } + } + + int64_t post_numel = 1; + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) != axes.end()) { + perm.push_back(i); + post_numel *= input_dim[i]; + } + } + reshape.push_back(post_numel); + + DDim trans_dim(input_dim); + int ndims = perm.size(); + for (int i = 0; i < ndims; i++) { + trans_dim[i] = input_dim[perm[i]]; + } + x->Resize(trans_dim); + dev_ctx.template Alloc(x); + funcs::TransCompute(ndims, dev_ctx, input, x, perm); + + x->Resize(make_ddim(reshape)); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keep_dim, + DenseTensor* out, + DenseTensor* medians); +} // namespace phi diff --git a/paddle/phi/kernels/nll_loss_grad_kernel.h b/paddle/phi/kernels/nll_loss_grad_kernel.h index c06f0726899ee..b682edc24df0e 100644 --- a/paddle/phi/kernels/nll_loss_grad_kernel.h +++ b/paddle/phi/kernels/nll_loss_grad_kernel.h @@ -22,7 +22,7 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, const DenseTensor& total_weight, const DenseTensor& d_out, int64_t ignore_index, diff --git a/paddle/phi/kernels/nll_loss_kernel.cc b/paddle/phi/kernels/nll_loss_kernel.cc index b271f0f4d06a0..cf6d4d01410b9 100644 --- a/paddle/phi/kernels/nll_loss_kernel.cc +++ b/paddle/phi/kernels/nll_loss_kernel.cc @@ -19,7 +19,7 @@ template void NllLossKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, DenseTensor* out) { diff --git a/paddle/phi/kernels/nll_loss_kernel.h b/paddle/phi/kernels/nll_loss_kernel.h index 90083e1d6840d..cffaa31486025 100644 --- a/paddle/phi/kernels/nll_loss_kernel.h +++ b/paddle/phi/kernels/nll_loss_kernel.h @@ -24,7 +24,7 @@ template void NllLossRawKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, DenseTensor* out, diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 3799b9d4892f8..1e5dfe2a542b0 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -70,6 +70,7 @@ struct BroadcastConfig { int strides_out[phi::DDim::kMaxRank]; int in_dim[phi::DDim::kMaxRank]; int dim_after_cmp[phi::DDim::kMaxRank]; + int y_dim_after_cmp[phi::DDim::kMaxRank]; int dim_size_after_cmp = 0; int cmp_res = 0; OptType cmp_type = OptType::CanNotOptimize; @@ -82,7 +83,7 @@ struct BroadcastConfig { HOSTDEVICE 
BroadcastConfig(const std::vector& out_dims, const std::vector& in_dims, - const std::vector& another_in_dims, + const std::vector& y_in_dims, int dim_size) { std::vector strides_in_tmp; std::vector strides_out_tmp; @@ -103,8 +104,8 @@ struct BroadcastConfig { memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int)); memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int)); - cmp_res = get_mnk_for_broadcast_ops(in_dims, another_in_dims); - get_opt_type(another_in_dims); + cmp_res = get_mnk_for_broadcast_ops(in_dims, y_in_dims); + get_opt_type(); buf_len = get_buf_len(); } @@ -154,7 +155,7 @@ struct BroadcastConfig { return index_src; } - void get_opt_type(const std::vector& y_dim_after_cmp) { + void get_opt_type() { if (dim_size_after_cmp == 1) { if (dim_after_cmp[0] == 1 && y_dim_after_cmp[0] != 1) { // {1} op {n} n = y_dim_after_cmp[0]; @@ -241,6 +242,7 @@ struct BroadcastConfig { int cmp_x = 0; int cmp_y = 0; bool is_same = false; + std::vector xshape_after_remove_ones = xshape; std::vector yshape_after_remove_ones = yshape; // first step: remove excess ones @@ -275,6 +277,7 @@ struct BroadcastConfig { } idx = idx + 1; dim_after_cmp[after_cmp_idx] = cmp_x; + y_dim_after_cmp[after_cmp_idx] = cmp_y; after_cmp_idx++; if (idx == xshape_after_remove_ones.size()) { dim_size_after_cmp = after_cmp_idx; diff --git a/paddle/phi/kernels/psroi_pool_grad_kernel.h b/paddle/phi/kernels/psroi_pool_grad_kernel.h index 87163eb8e079f..8dcf81194e269 100644 --- a/paddle/phi/kernels/psroi_pool_grad_kernel.h +++ b/paddle/phi/kernels/psroi_pool_grad_kernel.h @@ -23,7 +23,7 @@ template void PsroiPoolGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, const DenseTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/psroi_pool_kernel.h b/paddle/phi/kernels/psroi_pool_kernel.h index 341037af2caec..5838fa895119d 100644 --- a/paddle/phi/kernels/psroi_pool_kernel.h +++ b/paddle/phi/kernels/psroi_pool_kernel.h @@ -23,7 +23,7 @@ template void PsroiPoolKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, int pooled_height, int pooled_width, int output_channels, diff --git a/paddle/phi/kernels/rmsprop_kernel.h b/paddle/phi/kernels/rmsprop_kernel.h index 4c3c9aa822115..fba2095cc8bce 100644 --- a/paddle/phi/kernels/rmsprop_kernel.h +++ b/paddle/phi/kernels/rmsprop_kernel.h @@ -26,7 +26,7 @@ void RmspropDenseKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& moment, const DenseTensor& learning_rate, - paddle::optional mean_grad, + const paddle::optional& mean_grad, float epsilon, float decay, float momentum, @@ -43,7 +43,7 @@ void RmspropSparseKernel(const Context& dev_ctx, const SelectedRows& grad, const DenseTensor& moment, const DenseTensor& learning_rate, - paddle::optional mean_grad, + const paddle::optional& mean_grad, float epsilon, float decay, float momentum, diff --git a/paddle/phi/kernels/rnn_grad_kernel.h b/paddle/phi/kernels/rnn_grad_kernel.h index e5b1100cf7203..024ed287bb13f 100644 --- a/paddle/phi/kernels/rnn_grad_kernel.h +++ b/paddle/phi/kernels/rnn_grad_kernel.h @@ -24,7 +24,7 @@ void RnnGradKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, const DenseTensor& out, const DenseTensor& dropout_state, const DenseTensor& reserve, diff --git 
a/paddle/phi/kernels/rnn_kernel.h b/paddle/phi/kernels/rnn_kernel.h index f1534aa598844..61dfb6f56d798 100644 --- a/paddle/phi/kernels/rnn_kernel.h +++ b/paddle/phi/kernels/rnn_kernel.h @@ -24,7 +24,7 @@ void RnnKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, float dropout_prob, bool is_bidirec, int input_size, diff --git a/paddle/phi/kernels/roi_align_grad_kernel.h b/paddle/phi/kernels/roi_align_grad_kernel.h index eea1fa03886a4..a7c2ed3beb53a 100644 --- a/paddle/phi/kernels/roi_align_grad_kernel.h +++ b/paddle/phi/kernels/roi_align_grad_kernel.h @@ -23,7 +23,7 @@ template void RoiAlignGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& out_grad, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h index 9734da53b7f45..fa3161e3238df 100644 --- a/paddle/phi/kernels/roi_align_kernel.h +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -23,7 +23,7 @@ template void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/roi_pool_grad_kernel.h b/paddle/phi/kernels/roi_pool_grad_kernel.h index d7f1c378f75c3..f18bd1d65e644 100644 --- a/paddle/phi/kernels/roi_pool_grad_kernel.h +++ b/paddle/phi/kernels/roi_pool_grad_kernel.h @@ -23,7 +23,7 @@ template void RoiPooGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& arg_max, const DenseTensor& out_grad, int pooled_height, diff --git a/paddle/phi/kernels/roi_pool_kernel.h b/paddle/phi/kernels/roi_pool_kernel.h index c6ff6f223612a..e7ed2587968f5 100644 --- a/paddle/phi/kernels/roi_pool_kernel.h +++ b/paddle/phi/kernels/roi_pool_kernel.h @@ -25,7 +25,7 @@ template void RoiPoolKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/rrelu_grad_kernel.h b/paddle/phi/kernels/rrelu_grad_kernel.h new file mode 100644 index 0000000000000..b6172fca10e53 --- /dev/null +++ b/paddle/phi/kernels/rrelu_grad_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
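Note (context for the new rrelu headers below; this follows the standard randomized leaky ReLU definition rather than anything stated in this patch): RReluKernel computes out = x for x >= 0 and out = a * x for x < 0, with a drawn from Uniform(lower, upper) during training and fixed to (lower + upper) / 2 when is_test is true; the extra noise output, which RReluGradKernel takes alongside out_grad, presumably records the per-element coefficient so the backward pass can scale out_grad without re-sampling.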
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/rrelu_kernel.h b/paddle/phi/kernels/rrelu_kernel.h new file mode 100644 index 0000000000000..8deb52daaae13 --- /dev/null +++ b/paddle/phi/kernels/rrelu_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise); +} // namespace phi diff --git a/paddle/phi/kernels/segment_pool_grad_kernel.h b/paddle/phi/kernels/segment_pool_grad_kernel.h index e773eed16e8c8..edf9ff9c7568c 100644 --- a/paddle/phi/kernels/segment_pool_grad_kernel.h +++ b/paddle/phi/kernels/segment_pool_grad_kernel.h @@ -23,7 +23,7 @@ void SegmentPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& segment_ids, const DenseTensor& out, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const DenseTensor& out_grad, const std::string& pooltype, DenseTensor* x_grad); diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.h b/paddle/phi/kernels/selected_rows/adam_kernel.h index 2e13d29d17284..79f87a8ed75c0 100644 --- a/paddle/phi/kernels/selected_rows/adam_kernel.h +++ b/paddle/phi/kernels/selected_rows/adam_kernel.h @@ -31,8 +31,8 @@ void AdamDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.h b/paddle/phi/kernels/selected_rows/adamw_kernel.h index ddb155ce4504e..5dda8107d52e3 100644 --- a/paddle/phi/kernels/selected_rows/adamw_kernel.h +++ b/paddle/phi/kernels/selected_rows/adamw_kernel.h @@ -31,8 +31,8 @@ void AdamwDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc index fae876facfc8f..f0c0ffb591a11 100644 --- a/paddle/phi/kernels/selected_rows/assign_kernel.cc +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -20,7 +20,7 @@ namespace phi { namespace sr { -// Note: use `const paddle::optional x` +// Note: use `const 
paddle::optional& x` // as input if needed template void AssignKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc index 57e33beb95e3e..d96c707538e41 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc @@ -35,8 +35,8 @@ void AdamDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc index a52bca761108c..6d2fc164d6b33 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc @@ -35,8 +35,8 @@ void AdamwDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu index 31abac149951d..18b6da818a1f3 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu @@ -102,8 +102,8 @@ void AdamDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu index b847f48d12267..182c4390b1722 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu @@ -112,8 +112,8 @@ void AdamwDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc index 1660601bbd36e..616786d210df7 100644 --- a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc @@ -40,9 +40,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h index 4c03b83d80fff..aca355f515c44 100644 --- 
a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h @@ -25,9 +25,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/sgd_kernel.h b/paddle/phi/kernels/sgd_kernel.h index 12361c738e247..226a719b90244 100644 --- a/paddle/phi/kernels/sgd_kernel.h +++ b/paddle/phi/kernels/sgd_kernel.h @@ -24,7 +24,7 @@ void SGDDenseKernel(const Context& dev_ctx, const DenseTensor& param, const DenseTensor& learning_rate, const DenseTensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out); @@ -35,7 +35,7 @@ void SGDDenseParamSparseGradKernel( const DenseTensor& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out); @@ -46,7 +46,7 @@ void SGDSparseParamSparseGradKernel( const SelectedRows& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, SelectedRows* param_out, SelectedRows* master_param_out); diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 6120d6339a7eb..62a72a9dd4115 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/sparse/copy_kernel.h b/paddle/phi/kernels/sparse/copy_kernel.h index a43621a4dfeed..70e2aaef8a888 100644 --- a/paddle/phi/kernels/sparse/copy_kernel.h +++ b/paddle/phi/kernels/sparse/copy_kernel.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index ff2647de731d7..b208e70e04046 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/fluid/platform/enforce.h" + namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index d39790fcea5e3..93abf70b24412 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" diff --git a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc index 156cea63f171c..41889f9cc5ed7 100644 --- a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc +++ b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/strings/strings_copy_kernel.h" #include "paddle/phi/core/kernel_registry.h" +#include "glog/logging.h" + namespace phi { namespace strings { diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.h b/paddle/phi/kernels/strings/strings_empty_kernel.h index 1add1963614d8..8a014f2a78c2c 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.h +++ b/paddle/phi/kernels/strings/strings_empty_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/infermeta/strings/nullary.h" diff --git a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h index 97f530164528a..db6c267a8586d 100644 --- a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h +++ b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/infermeta/strings/unary.h" #include "paddle/phi/kernels/strings/case_utils.h" diff --git a/paddle/phi/kernels/warpctc_grad_kernel.h b/paddle/phi/kernels/warpctc_grad_kernel.h index 8e1ab43324a50..8a8251aabe468 100644 --- a/paddle/phi/kernels/warpctc_grad_kernel.h +++ b/paddle/phi/kernels/warpctc_grad_kernel.h @@ -24,7 +24,7 @@ void WarpctcGradKernel(const Context& dev_ctx, const DenseTensor& warpctc_grad, const DenseTensor& logits, const DenseTensor& loss_grad, - paddle::optional logits_length, + const paddle::optional& logits_length, int blank, bool norm_by_times, DenseTensor* logits_grad); diff --git a/paddle/phi/kernels/warpctc_kernel.h b/paddle/phi/kernels/warpctc_kernel.h index 4baa49064775e..0b9e9eb87f675 100644 --- a/paddle/phi/kernels/warpctc_kernel.h +++ b/paddle/phi/kernels/warpctc_kernel.h @@ -23,8 +23,8 @@ template void WarpctcKernel(const Context& dev_ctx, const DenseTensor& logits, const DenseTensor& label, - paddle::optional logits_length, - paddle::optional labels_length, + const paddle::optional& logits_length, + const paddle::optional& labels_length, int blank, bool norm_by_times, DenseTensor* warpctc_grad, diff --git a/paddle/phi/kernels/yolov3_loss_grad_kernel.h b/paddle/phi/kernels/yolov3_loss_grad_kernel.h index 789e782443f68..b4ce5b9539813 100644 --- a/paddle/phi/kernels/yolov3_loss_grad_kernel.h +++ b/paddle/phi/kernels/yolov3_loss_grad_kernel.h @@ -23,7 +23,7 @@ void Yolov3LossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const DenseTensor& loss_grad, const DenseTensor& objectness_mask, const DenseTensor& gt_match_mask, diff --git a/paddle/phi/kernels/yolov3_loss_kernel.h b/paddle/phi/kernels/yolov3_loss_kernel.h index 
eb6668000dee0..3dabe5ce820ee 100644 --- a/paddle/phi/kernels/yolov3_loss_kernel.h +++ b/paddle/phi/kernels/yolov3_loss_kernel.h @@ -23,7 +23,7 @@ void Yolov3LossKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc index c6aae1bf5bb54..49f31288d00f6 100644 --- a/paddle/phi/ops/compat/conv3d_sig.cc +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -49,7 +49,7 @@ KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Conv3dDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("conv3d_grad_grad", - {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, {"strides", "paddings", "padding_algorithm", @@ -59,7 +59,7 @@ KernelSignature Conv3dDoubleGradOpArgumentMapping( "use_addto", "workspace_size_MB", "exhaustive_search"}, - {"DDOutput", "DInput", "DFilter"}); + {"DInput", "DFilter", "DDOutput"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index 0b3cc3425df45..5e45bcf97ce0e 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -17,14 +17,15 @@ limitations under the License. */ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out"}); + return KernelSignature( + "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache"}); } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("einsum_grad", - {"Operands", {"Out@GRAD"}}, + {"Operands", "InnerCache", "Out@GRAD"}, {"equation"}, - {{"Operands@GRAD"}}); + {"Operands@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index c760c966b0647..17fb1858373d9 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -25,6 +25,11 @@ KernelSignature ElementwiseAddOpArgumentMapping( return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"}); } +KernelSignature ElementwiseGradAddOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grad_add", {"X", "Y"}, {}, {"Out"}); +} + KernelSignature ElementwiseSubOpArgumentMapping( const ArgumentMappingContext& ctx) { int axis = paddle::any_cast(ctx.Attr("axis")); @@ -317,3 +322,4 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside_grad, phi::ElementwiseHeavisideGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad, phi::ElementwisePowGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(grad_add, phi::ElementwiseGradAddOpArgumentMapping); diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc new file mode 100644 index 0000000000000..2b490078512b1 --- /dev/null +++ b/paddle/phi/ops/compat/instance_norm_sig.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
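Editor's note on the einsum_sig.cc hunk above: the forward mapping now also registers the InnerCache output, and einsum_grad consumes it alongside Operands and Out@GRAD, so the intermediate results cached by the forward contraction are available to the backward kernel. The public Python entry point is paddle.einsum; the sketch below only illustrates that API (the shapes are made up, not taken from the patch):

    import paddle

    # illustrative shapes only
    x = paddle.rand([2, 3])
    y = paddle.rand([3, 4])

    # a matrix product written as an einsum contraction
    z = paddle.einsum("ij,jk->ik", x, y)
    print(z.shape)  # [2, 4]
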
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature InstanceNormOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm", + {"X", "Scale", "Bias"}, + {"epsilon"}, + {"Y", "SavedMean", "SavedVariance"}); +} + +KernelSignature InstanceNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm_grad", + {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"}, + {"epsilon"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} +KernelSignature InstanceNormDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm_double_grad", + {"X", + "Scale", + "SavedMean", + "SavedVariance", + "DY", + "DDX", + "DDScale", + "DDBias"}, + {"epsilon"}, + {"DX", "DScale", "DDY"}); +} +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(instance_norm_grad_grad, + instance_norm_double_grad); +PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad, + phi::InstanceNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad_grad, + phi::InstanceNormDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/nanmedian_sig.cc b/paddle/phi/ops/compat/nanmedian_sig.cc new file mode 100644 index 0000000000000..5ca0d450e3b41 --- /dev/null +++ b/paddle/phi/ops/compat/nanmedian_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature NanmedianOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "nanmedian", {"X"}, {"axis", "keepdim"}, {"Out", "MedianIndex"}); +} + +KernelSignature NanmedianGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("nanmedian_grad", + {"X", "MedianIndex", "Out@GRAD"}, + {"axis", "keepdim"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(nanmedian, phi::NanmedianOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nanmedian_grad, phi::NanmedianGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/rrelu_sig.cc b/paddle/phi/ops/compat/rrelu_sig.cc new file mode 100644 index 0000000000000..00cd705a24076 --- /dev/null +++ b/paddle/phi/ops/compat/rrelu_sig.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
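Editor's note on nanmedian_sig.cc above: the op carries the attributes axis and keepdim and produces Out plus a MedianIndex tensor that the grad mapping feeds back into nanmedian_grad. The matching Python export, paddle.nanmedian, is added later in this patch in python/paddle/__init__.py. A minimal usage sketch; since the defaults of axis and keepdim are not restated here, both are passed explicitly:

    import paddle

    x = paddle.to_tensor([[1.0, float("nan"), 3.0],
                          [4.0, 5.0, float("nan")]])

    # median over all elements, ignoring NaNs
    print(paddle.nanmedian(x, axis=None, keepdim=False))

    # per-column median, ignoring NaNs
    print(paddle.nanmedian(x, axis=0, keepdim=False))
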
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "rrelu", {"X"}, {"lower", "upper", "is_test"}, {"Out", "Noise"}); +} + +KernelSignature RReluGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "rrelu_grad", {"X", "Noise", "Out@GRAD"}, {}, {"X@GRAD"}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(rrelu, phi::RReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rrelu_grad, phi::RReluGradGradOpArgumentMapping); diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index 5c1d0989629dc..2333f82d626c4 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -15,6 +15,7 @@ cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_embedding_api SRCS test_embedding_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index 0217ba23b2274..16143fb11e0ff 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -19,7 +19,6 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/tests/api/test_embedding_api.cc b/paddle/phi/tests/api/test_embedding_api.cc new file mode 100644 index 0000000000000..6ccd382786bd1 --- /dev/null +++ b/paddle/phi/tests/api/test_embedding_api.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(sparse_weight_embedding, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_weight_embedding_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_weight_embedding_sparse_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(empty, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + +namespace paddle { +namespace tests { + +TEST(API, sparse_weight_embedding) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out = paddle::experimental::embedding(x, weight); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 4); + ASSERT_EQ(out.numel(), 12); + ASSERT_EQ(out.type(), phi::DataType::FLOAT32); + ASSERT_EQ(out.layout(), phi::DataLayout::NCHW); +} + +TEST(API, sparse_weight_embedding_grad) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out_grad = paddle::experimental::full({4, 3}, 1, DataType::FLOAT32); + + paddle::experimental::Tensor weight_grad; + + paddle::experimental::embedding_grad( + x, weight, out_grad, -1, false, &weight_grad); + + // 3. check result + ASSERT_EQ(weight_grad.dims().size(), 2); + ASSERT_EQ(weight_grad.dims()[0], 16); + ASSERT_EQ(weight_grad.numel(), 48); + ASSERT_EQ(weight_grad.type(), phi::DataType::FLOAT32); + ASSERT_EQ(weight_grad.layout(), phi::DataLayout::NCHW); +} + +TEST(API, sparse_weight_embedding_sparse_grad) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out_grad = paddle::experimental::full({4, 3}, 1, DataType::FLOAT32); + + paddle::experimental::Tensor weight_grad; + + paddle::experimental::embedding_grad( + x, weight, out_grad, -1, true, &weight_grad); + + // 3. check result + ASSERT_EQ(weight_grad.dims().size(), 2); + ASSERT_EQ(weight_grad.dims()[0], 4); + ASSERT_EQ(weight_grad.numel(), 12); + ASSERT_EQ(weight_grad.type(), phi::DataType::FLOAT32); + ASSERT_EQ(weight_grad.layout(), phi::DataLayout::NCHW); +} + +} // namespace tests +} // namespace paddle diff --git a/paddle/phi/tests/common/test_int_array.cc b/paddle/phi/tests/common/test_int_array.cc index b6c4f2b1ea8e3..a6278ee4a34fc 100644 --- a/paddle/phi/tests/common/test_int_array.cc +++ b/paddle/phi/tests/common/test_int_array.cc @@ -25,8 +25,10 @@ limitations under the License. 
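Editor's note on test_embedding_api.cc above: the test drives the sparse-weight embedding forward and backward through the C++ paddle::experimental API. A rough Python-level counterpart is paddle.nn.functional.embedding; the sketch below is illustrative only and does not reproduce the SelectedRows weight used in the test:

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([0, 4, 3, 1], dtype="int64")
    weight = paddle.full([7, 3], 2.0)

    # dense lookup; the output shape is [4, 3], as checked in the C++ test
    out = F.embedding(x, weight)
    print(out.shape)

    # sparse=True requests a selected-rows (sparse) weight gradient in the
    # backward pass, the case the *_sparse_grad kernels above are declared for
    out_sparse = F.embedding(x, weight, sparse=True)
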
*/ #include "gtest/gtest.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); #endif namespace phi { diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index 634edaec96d29..abd77e2862410 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" diff --git a/paddle/phi/tests/core/test_dense_tensor.cc b/paddle/phi/tests/core/test_dense_tensor.cc index ddfa184df2c1e..42814317b9c83 100644 --- a/paddle/phi/tests/core/test_dense_tensor.cc +++ b/paddle/phi/tests/core/test_dense_tensor.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gtest/gtest.h" +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/tests/core/allocator.h" diff --git a/paddle/phi/tests/core/test_sparse_coo_tensor.cc b/paddle/phi/tests/core/test_sparse_coo_tensor.cc index 5d0e16b0528e7..5e7642bbfdcb0 100644 --- a/paddle/phi/tests/core/test_sparse_coo_tensor.cc +++ b/paddle/phi/tests/core/test_sparse_coo_tensor.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gtest/gtest.h" +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/tests/core/allocator.h" diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 0f70f9a8f3564..2a18d2f7e0195 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -85,6 +85,9 @@ if not defined NEW_RELEASE_JIT set NEW_RELEASE_JIT=OFF set task_name=%1 set UPLOAD_TP_FILE=OFF +set error_code=0 +type %cache_dir%\error_code.txt + rem ------initialize set git config------ git config --global core.longpaths true @@ -118,8 +121,6 @@ if "%WITH_CACHE%"=="OFF" ( goto :mkbuild ) -set error_code=0 -type %cache_dir%\error_code.txt : set /p error_code=< %cache_dir%\error_code.txt if %error_code% NEQ 0 ( rmdir %BUILD_DIR% /s/q diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2eda74b769c04..efd2de5621604 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2986,20 +2986,11 @@ function build_develop() { } function check_coverage_build() { - if [ ! 
"${buildSize}" ];then - echo "build size not found" - exit 1 - fi - - if [ ${WITH_COVERAGE} != "ON" ];then - echo "WARNING: check_coverage need to compile with WITH_COVERAGE=ON, but got WITH_COVERAGE=OFF" - exit 1 - fi - rm -f build_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/build_size + curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size dev_coverage_build_size=`cat build_size|sed 's#G##g'` - pr_coverage_build_size=`echo $buildSize|sed 's#G##g'` + pr_coverage_build_size=`cat coverage_build_size|sed 's#G##g'` diff_coverage_build_size=`echo $(($pr_coverage_build_size - $dev_coverage_build_size))` @@ -3149,7 +3140,6 @@ function main() { check_diff_file_for_coverage cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} enable_unused_var_check - check_coverage_build ;; gpu_cicheck_coverage) check_approvals_of_unittest 1 @@ -3157,6 +3147,9 @@ function main() { check_coverage check_change_of_unittest ${PYTHON_ABI:-""} ;; + check_coverage_build) + check_coverage_build + ;; ci_preciseTest) insert_pile_to_h_cu_diff cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 64c88a47b4393..7669c06b2c2b7 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -1,3 +1,5 @@ add_subdirectory(string) -cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) + cc_test(array_ref_test SRCS array_ref_test.cc DEPS gtest gflags) +cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) +cc_test(variant_test SRCS variant_test.cc DEPS gtest) diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h index eec5f32be7226..2b5a657f4d42e 100644 --- a/paddle/utils/optional.h +++ b/paddle/utils/optional.h @@ -100,7 +100,11 @@ class reference_content { public: // structors ~reference_content() {} +// TODO(zhiqiu): remove it +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" reference_content(RefT r) : content_(r) {} +#pragma GCC diagnostic pop reference_content(const reference_content& operand) : content_(operand.content_) {} diff --git a/paddle/utils/variant.h b/paddle/utils/variant.h index 50bdc4287e21a..4348abc9cbff0 100644 --- a/paddle/utils/variant.h +++ b/paddle/utils/variant.h @@ -2199,6 +2199,18 @@ class impl : public copy_assignment> { } } + inline const std::type_info &type() const { + return visitation::alt::visit_alt_at( + this->index(), +#ifdef MPARK_GENERIC_LAMBDAS + [](auto &alt) -> const std::type_info & { return typeid(alt.value); } +#else + typer {} +#endif + , + *this); + } + private: #ifndef MPARK_GENERIC_LAMBDAS struct swapper { @@ -2208,6 +2220,13 @@ class impl : public copy_assignment> { swap(this_alt.value, that_alt.value); } }; + + struct typer { + template + inline const std::type_info &operator()(Alt &alt) const { + return typeid(alt.value); + } + }; #endif inline constexpr bool move_nothrow() const { @@ -2432,6 +2451,8 @@ class variant { impl_.swap(that.impl_); } + inline const std::type_info &type() noexcept { return impl_.type(); } + private: detail::impl impl_; diff --git a/paddle/utils/variant_test.cc b/paddle/utils/variant_test.cc new file mode 100644 index 0000000000000..e690269d801c1 --- /dev/null +++ b/paddle/utils/variant_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/utils/variant.h" +#include "gtest/gtest.h" +#include "paddle/phi/core/enforce.h" + +TEST(interface_test, type) { + using phi::enforce::demangle; + + paddle::variant var; + + var = true; + EXPECT_EQ(demangle(var.type().name()), "bool"); + + var = 0; + EXPECT_EQ(demangle(var.type().name()), "int"); + + var = 0.f; + EXPECT_EQ(demangle(var.type().name()), "float"); +} diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 132105fb2b689..930918e967eed 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -331,6 +331,7 @@ from .tensor.stat import var # noqa: F401 from .tensor.stat import numel # noqa: F401 from .tensor.stat import median # noqa: F401 +from .tensor.stat import nanmedian # noqa: F401 from .tensor.stat import quantile # noqa: F401 from .tensor.stat import nanquantile # noqa: F401 from .device import get_cudnn_version # noqa: F401 @@ -498,6 +499,7 @@ 'load', 'numel', 'median', + 'nanmedian', 'quantile', 'nanquantile', 'no_grad', diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index b33dc1aaeb086..8cb4f5f765611 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -224,7 +224,7 @@ def max_memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Allocated", device_id) + return core.device_memory_stat_peak_value("Allocated", device_id) def max_memory_reserved(device=None): @@ -255,7 +255,7 @@ def max_memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Reserved", device_id) + return core.device_memory_stat_peak_value("Reserved", device_id) def memory_allocated(device=None): @@ -290,7 +290,7 @@ def memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Allocated", device_id) + return core.device_memory_stat_current_value("Allocated", device_id) def memory_reserved(device=None): @@ -321,7 +321,7 @@ def memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." 
) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Reserved", device_id) + return core.device_memory_stat_current_value("Reserved", device_id) def _set_current_stream(stream): diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 31bdc4cc650af..465c450c0b076 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -20,7 +20,7 @@ from paddle.fluid import framework from .utils import print_program_with_dist_attr -from .operators import find_best_compatible_distributed_operator_impl +from .operators import find_compatible_distributed_operator_impls from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator @@ -238,13 +238,17 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name()) compatible_dims_mapping = compute_compatible_dims_mapping( [op_dims_mapping, tensor_dims_mapping]) + if not _validate_dims_mapping( + compatible_dims_mapping, + op_dist_attr.process_mesh): + continue if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != op_dims_mapping): op_dist_attr.set_input_dims_mapping( tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=True) if op_dist_impls is not None: not_compatible = True @@ -254,7 +258,8 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): dim_changed = op_dist_impl.update_dims_mapping(dist_op) if dim_changed: changed = True - if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.is_auto_compatible(dist_op) \ + and dist_op.validate_dist_attr(): if op_dist_impl.type == "elementwise": op_dist_attr.impl_type = "default" else: @@ -289,13 +294,17 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name()) compatible_dims_mapping = compute_compatible_dims_mapping( [op_dims_mapping, tensor_dims_mapping]) + if not _validate_dims_mapping( + compatible_dims_mapping, + op_dist_attr.process_mesh): + continue if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != op_dims_mapping): op_dist_attr.set_output_dims_mapping( tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=False) if op_dist_impls is not None: not_compatible = True @@ -305,8 +314,8 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): dim_changed = op_dist_impl.update_dims_mapping(dist_op) if dim_changed: changed = True - if op_dist_impl.is_auto_compatible(dist_op): - not_compatible = False + if op_dist_impl.is_auto_compatible(dist_op) \ + and dist_op.validate_dist_attr(): if op_dist_impl.type == "elementwise": op_dist_attr.impl_type = "default" else: @@ -352,6 +361,23 @@ def _update_dims_mapping_between_graphs(self): changed = True return changed + def _update_dims_mapping_for_special(self): + # Set the dims_mapping of a tensor to the dims_mapping inside the op which produces it + op_nodes = self._dist_context._serial_ordered_op_nodes + for op_node in op_nodes: + op_dist_attr = 
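Editor's note on the python/paddle/device/cuda changes above: only the internal core bindings are renamed (memory_stat_get_peak and memory_stat_get_current become device_memory_stat_peak_value and device_memory_stat_current_value); the public functions keep their names and arguments. A minimal usage sketch, assuming a GPU build of PaddlePaddle with at least one visible CUDA device (CPU-only builds raise the error shown above):

    import paddle

    # allocate something on GPU 0 so the counters are non-zero
    x = paddle.rand([1024, 1024])

    print(paddle.device.cuda.memory_allocated(0))      # bytes currently allocated
    print(paddle.device.cuda.max_memory_allocated(0))  # peak allocated bytes
    print(paddle.device.cuda.memory_reserved(0))       # bytes currently reserved
    print(paddle.device.cuda.max_memory_reserved(0))   # peak reserved bytes
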
self._dist_context.get_dist_attr_for_graph(op_node) + for tensor_node in op_node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER: + continue + tensor_desc = tensor_node.var() + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh: + op_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_desc.name()) + tensor_dist_attr.dims_mapping = op_dims_mapping + def _update_dims_mapping(self): # Complete dims_mapping for each node reach_fix_point = False @@ -378,6 +404,7 @@ def _update_dims_mapping(self): reach_fix_point = False else: reach_fix_point = True + self._update_dims_mapping_for_special() def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) @@ -685,7 +712,7 @@ def _update_process_mesh(self): # Step 3: adjust the process meshes for special ops self._update_process_mesh_for_specials() - # Step 4: adjust the process meshes between graphs + # Step 4: adjust the process meshes between graphs self._update_process_mesh_between_graphs() def _prepare(self): @@ -727,14 +754,14 @@ def complete_forward_annotation(self, serial_main_program=None): """ Complete annotation for the partial annotated serial_main_program. Arguments: serial_main_program: partial annotated serial_main_program. - Returns: + Returns:e serial_main_program: completed annotated serial_main_program. """ if serial_main_program is None: serial_main_program = self._dist_context.serial_main_program else: - self._dist_context.serial_main_program = serial_main_program + self._dist_context._serial_main_program = serial_main_program self._dist_context.initialize() @@ -757,13 +784,18 @@ def complete_forward_annotation(self, serial_main_program=None): return serial_main_program - def _complete_high_order_grad_annotation(self, serial_main_program): + def _complete_high_order_grad_annotation(self, serial_main_program=None): """ NOTE: [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient. This function is temporary to support high order gradient, and will be removed in the future. 
""" + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context._serial_main_program = serial_main_program + def _is_grad_var_name(name): if "@GRAD" in name: return True @@ -771,7 +803,7 @@ def _is_grad_var_name(name): def _get_op_by_id(ops, id): for op in ops: - if op.desc.id() == id: + if op.desc.original_id() == id: return op return None @@ -796,10 +828,12 @@ def _get_op_by_id(ops, id): # complete the annotation of grad op (xxx_grad op or sum op) # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + if grad_op.desc.original_id( + ) in dist_op_context.grad_op_id_to_op_id: # TODO support the case where one forward op corresponding to multiple xxx_grad op - forward_op = _get_op_by_id( - ops, dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + forward_op = _get_op_by_id(ops, + dist_op_context.grad_op_id_to_op_id[ + grad_op.desc.original_id()]) assert forward_op is not None fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( @@ -915,12 +949,13 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - def complete_backward_annotation(self, serial_main_program): + def complete_backward_annotation(self, serial_main_program=None): """Complete the annotation of vars and ops in the backward phase for parallel program.""" + if serial_main_program is None: serial_main_program = self._dist_context.serial_main_program else: - self._dist_context.serial_main_program = serial_main_program + self._dist_context._serial_main_program = serial_main_program def _is_grad_var_name(name): if "@GRAD" in name: @@ -935,7 +970,7 @@ def _get_forward_varname_from_grad_varname(grad_var_name): def _get_op_by_id(ops, id): for op in ops: - if op.desc.id() == id: + if op.desc.original_id() == id: return op return None @@ -997,11 +1032,12 @@ def _get_op_by_id(ops, id): # complete the annotation of grad op (xxx_grad op or sum op) # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + if grad_op.desc.original_id( + ) in dist_op_context.grad_op_id_to_op_id: # TODO support the case where one forward op corresponding to multiple xxx_grad op - forward_op = _get_op_by_id( - ops[:first_backward_op_idx], - dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + forward_op = _get_op_by_id(ops[:first_backward_op_idx], + dist_op_context.grad_op_id_to_op_id[ + grad_op.desc.original_id()]) assert forward_op is not None if grad_op.type == "concat" and forward_op.type == "split": @@ -1029,6 +1065,9 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.process_mesh = ref_mesh self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) + grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type + grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx + continue fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( @@ -1075,6 +1114,8 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.set_output_dims_mapping(output_name, ref_dims_mapping) + grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type + grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) @@ -1108,6 +1149,8 @@ def _get_op_by_id(ops, id): var_name, ref_fwd_dims_mapping) grad_op_dist_attr.set_output_dims_mapping( output_name, 
ref_fwd_dims_mapping) + grad_op_dist_attr.impl_type = "default" + grad_op_dist_attr.impl_idx = 0 elif grad_op.type == 'fill_zeros_like': ref_var_name = grad_op.input_arg_names[0] @@ -1139,12 +1182,13 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - def complete_update_annotation(self, serial_main_program=None): + def complete_update_annotation(self, serial_main_program): """Complete the annotation of vars and ops in the update phase for parallel program.""" - if serial_main_program is None: - serial_main_program = self._dist_context.serial_main_program - else: - self._dist_context.serial_main_program = serial_main_program + + # Notice: serial_main_program is actually a dist_main_program of current rank, + # and must be passed into this function. + # TODO: We should fix this behavior. + ops = list(serial_main_program.global_block().ops) vars = serial_main_program.global_block().vars learning_rate_completed = False @@ -1301,7 +1345,7 @@ def _init_global_mesh_for_program(self): dist_op.dist_attr.process_mesh = world_ranks # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=True) if op_dist_impls is not None: backup_op_dist_attr = copy.deepcopy(dist_op.dist_attr) diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index 28d2e2d5a3088..8958c4bf905c2 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -23,7 +23,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(AssignOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -41,7 +41,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(AssignValueOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -59,7 +59,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(BeamSearchOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -77,7 +77,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(BeamSearchDecodeOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -95,7 +95,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(CastOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete 
COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -113,7 +113,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ConcatOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -131,7 +131,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseAddOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -149,7 +149,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseAddGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -167,7 +167,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseDivOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -185,7 +185,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseDivGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -203,7 +203,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseMulOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -221,7 +221,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseMulGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -239,7 +239,25 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseSubOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def 
calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseSubGradOpCost(CompOpCost): + OP_TYPE = "elementwise_sub_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseSubGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -257,7 +275,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(EmbeddingOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -275,7 +293,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(EmbeddingGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -293,7 +311,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -311,7 +329,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantBatchSizeLikeOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -329,7 +347,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantBatchSizeLikeGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -347,7 +365,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GatherOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -365,7 +383,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GeluOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in 
the future return 0 @@ -383,7 +401,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GeluGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -401,7 +419,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GreaterEqualOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -419,7 +437,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(IncrementOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -433,7 +451,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(IsEmptyOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -447,7 +465,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LayerNormOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -465,7 +483,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LayerNormGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -483,7 +501,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LessThanOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -501,7 +519,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LogicalNotOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -519,7 +537,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LogicalAndOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, 
the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -537,7 +555,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LodResetOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -554,7 +572,7 @@ class LogOpCost(CompOpCost): def __init__(self, op=None, op_desc=None, cluster=None): super(LogOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -572,7 +590,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LookupTableV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -590,7 +608,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LookupTableV2GradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -608,7 +626,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -626,7 +644,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -644,7 +662,527 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MatmulV2GradOpCost(CompOpCost): + OP_TYPE = "matmul_v2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MatmulV2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete 
COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MemcpyOpCost(CompOpCost): + OP_TYPE = "memcpy" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MemcpyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MulOpCost(CompOpCost): + OP_TYPE = "mul" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MulOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MulGradOpCost(CompOpCost): + OP_TYPE = "mul_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class OneHotOpCost(CompOpCost): + OP_TYPE = "one_hot" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(OneHotOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReadFromArrayOpCost(CompOpCost): + OP_TYPE = "read_from_array" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReadFromArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceSumOpCost(CompOpCost): + OP_TYPE = "reduce_sum" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceSumOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceSumGradOpCost(CompOpCost): + OP_TYPE = "reduce_sum_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceSumGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops 
function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Reshape2OpCost(CompOpCost): + OP_TYPE = "reshape2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Reshape2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Reshape2GradOpCost(CompOpCost): + OP_TYPE = "reshape2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Reshape2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceMeanOpCost(CompOpCost): + OP_TYPE = "reduce_mean" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceMeanOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceMeanGradOpCost(CompOpCost): + OP_TYPE = "reduce_mean_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceMeanGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SamplingIdOpCost(CompOpCost): + OP_TYPE = "sampling_id" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SamplingIdOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ScaleOpCost(CompOpCost): + OP_TYPE = "scale" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ScaleOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SliceOpCost(CompOpCost): + OP_TYPE = "slice" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SliceOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to 
be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxOpCost(CompOpCost): + OP_TYPE = "softmax" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxGradOpCost(CompOpCost): + OP_TYPE = "softmax_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxWithCrossEntropyOpCost(CompOpCost): + OP_TYPE = "softmax_with_cross_entropy" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxWithCrossEntropyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxWithCrossEntropyGradOpCost(CompOpCost): + OP_TYPE = "softmax_with_cross_entropy_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxWithCrossEntropyGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SplitOpCost(CompOpCost): + OP_TYPE = "split" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SplitOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Squeeze2OpCost(CompOpCost): + OP_TYPE = "squeeze2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Squeeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SquareOpCost(CompOpCost): + OP_TYPE = "square" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SquareOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete 
COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SquareGradOpCost(CompOpCost): + OP_TYPE = "square_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SquareGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SumOpCost(CompOpCost): + OP_TYPE = "sum" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SumOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class TopKOpCost(CompOpCost): + OP_TYPE = "top_k" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(TopKOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Transpose2OpCost(CompOpCost): + OP_TYPE = "transpose2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Transpose2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Transpose2GradOpCost(CompOpCost): + OP_TYPE = "transpose2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Transpose2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Unsqueeze2OpCost(CompOpCost): + OP_TYPE = "unsqueeze2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Unsqueeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class WriteToArrayOpCost(CompOpCost): + OP_TYPE = "write_to_array" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(WriteToArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time 
and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index 6fa5b756c75c3..3dbdb79f48541 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -132,15 +132,17 @@ def init(self, dist_attr): key, dist_attr) self._is_annotated = copy.deepcopy(dist_attr._is_annotated) - # def reset(self, skip_dist_attr_field_names): - # if skip_dist_attr_field_names is not None \ - # and "process_mesh" not in skip_dist_attr_field_names: - # self._process_mesh = None - # if skip_dist_attr_field_names is not None \ - # and "dims_mapping" not in skip_dist_attr_field_names: - # for i in enumerate(self._dims_mapping): - # self._dims_mapping[i] = -1 - # self._is_annotated = {} + def reset(self, skip_dist_attr_field_names=None): + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "process_mesh" not in skip_dist_attr_field_names): + self._process_mesh = None + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "dims_mapping" not in skip_dist_attr_field_names): + for i, _ in enumerate(self._dims_mapping): + self._dims_mapping[i] = -1 + self._is_annotated = {} def is_annotated(self, dist_attr_field_name): return self._is_annotated.get(dist_attr_field_name, False) @@ -272,6 +274,9 @@ def set_input_dist_attr(self, name, dist_attr): dist_attr_object.init(dist_attr) self._inputs_dist_attrs[name] = dist_attr_object + # def del_input_dist_attr(self, name): + # del self._inputs_dist_attrs[name] + def get_output_dist_attr(self, name): return self._outputs_dist_attrs.get(name, None) @@ -280,6 +285,9 @@ def set_output_dist_attr(self, name, dist_attr): dist_attr_object.init(dist_attr) self._outputs_dist_attrs[name] = dist_attr_object + # def del_output_dist_attr(self, name): + # del self._inputs_dist_attrs[name] + def get_input_dims_mapping(self, name): input_dist_attr = self.get_input_dist_attr(name) if input_dist_attr: @@ -374,17 +382,18 @@ def init(self, dist_attr): "ProcessMeshes in DistributedOperator must be the same." 
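Every CompOpCost subclass added earlier in this patch returns placeholder zeros from calc_flops and calc_time. For reference, here is a hypothetical, standalone sketch of what a filled-in cost for matmul_v2 could look like; the 2*M*K*N flop count is the standard formula, but the class shape and the peak-throughput constant are illustrative assumptions, not part of this patch:

class MatmulV2CostSketch:
    """Illustrative only: not the CompOpCost API introduced by this patch."""
    OP_TYPE = "matmul_v2"
    PEAK_GFLOPS = 19500.0  # assumed device peak throughput, in GFLOP/s

    def __init__(self, x_shape, y_shape):
        self.x_shape = x_shape  # [M, K]
        self.y_shape = y_shape  # [K, N]

    def calc_flops(self):
        m, k = self.x_shape
        k2, n = self.y_shape
        assert k == k2, "inner dimensions must match"
        return 2 * m * k * n  # one multiply and one add per output element

    def calc_time(self):
        # time in microseconds, assuming compute-bound execution at peak throughput
        return self.calc_flops() / (self.PEAK_GFLOPS * 1e3)

print(MatmulV2CostSketch((1024, 768), (768, 3072)).calc_time())  # roughly 248 us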
self.process_mesh = shared_process_mesh - # def reset(self, skip_dist_attr_field_names): - # for tensor_dist_attr in self.inputs_dist_attrs.values(): - # tensor_dist_attr.reset(skip_dist_attr_field_names) - # for tensor_dist_attr in self.outputs_dist_attrs.values(): - # tensor_dist_attr.reset(skip_dist_attr_field_names) - # if skip_dist_attr_field_names is not None \ - # and "process_mesh" not in skip_dist_attr_field_names: - # self.process_mesh = None - # self.impl_type = "default" - # self.impl_idx = 0 - # self._is_annotated = {} + def reset(self, skip_dist_attr_field_names=None): + for tensor_dist_attr in self.inputs_dist_attrs.values(): + tensor_dist_attr.reset(skip_dist_attr_field_names) + for tensor_dist_attr in self.outputs_dist_attrs.values(): + tensor_dist_attr.reset(skip_dist_attr_field_names) + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "process_mesh" not in skip_dist_attr_field_names): + self._process_mesh = None + self.impl_type = "default" + self.impl_idx = 0 + self._is_annotated = {} def is_annotated(self, attr_name): return self._is_annotated.get(attr_name, False) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 7299f84504bf3..6a38b53cf2c10 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -57,33 +57,30 @@ def __init__(self, serial_startup_prog=None, serial_optimizer=None, serial_loss=None, - feed_vars=None, - fetch_vars=None, + feed_vars={}, + fetch_vars={}, + cluster=None, strategy=None): # Data members related to original programs (unchanged) self._original_serial_main_program = serial_main_prog self._original_serial_startup_program = serial_startup_prog + self._original_serial_optimizer = serial_optimizer self._original_serial_loss = serial_loss + self._original_serial_feed_vars = feed_vars + self._original_serial_fetch_vars = fetch_vars self._original_serial_optimizer = serial_optimizer - if self._original_serial_main_program is None: - self._original_serial_main_program = paddle.fluid.default_main_program( - ) - if self._original_serial_startup_program is None: - self._original_serial_startup_program = paddle.fluid.default_startup_program( - ) # Data members related to programs (changed) self._serial_main_program = None self._serial_startup_program = None - self._serial_loss = serial_loss - self._serial_optimizer = serial_optimizer - self._serial_feed_vars = feed_vars - self._serial_fetch_vars = fetch_vars + self._serial_loss = None + self._serial_optimizer = None + self._serial_feed_vars = {} + self._serial_fetch_vars = {} # Data members related to the program self._dist_tensors_for_program = {} self._dist_ops_for_program = {} - self._block_state = BlockState() # Data members related to the graph self._serial_graph = None @@ -96,24 +93,30 @@ def __init__(self, # Distributed programs self._dist_main_programs = {} self._dist_startup_programs = {} + self._dist_op_context = DistributedOperatorContext() + self._process_meshes = [] - # Distributed Strategy + self._cluster = cluster self._strategy = strategy # Pass Context self._pass_context = PassContext() - - # Distributed Operator Context - self._dist_op_context = DistributedOperatorContext() + self._block_state = BlockState() # Other data members - self._process_meshes = [] self._serial_ordered_tensor_nodes = [] self._serial_ordered_op_nodes = [] self._serial_ordered_nodes = [] # 
self._tensor_id_to_tensor_node_ids = {} self._is_initialized = False + self._need_copy_dist_attr_to_graph = False + self._backup_pass_context_stack = [] + self._backup_block_state_stack = [] + self._backup_dist_tensors_for_program_stack = [] + self._backup_dist_ops_for_program_stack = [] + self._backup_serial_main_program_stack = [] + self._backup_serial_startup_program_stack = [] # flag whether scale gradient with dp size self._gradient_scale = True @@ -122,13 +125,6 @@ def __init__(self, def serial_main_program(self): return self._serial_main_program - @serial_main_program.setter - def serial_main_program(self, program): - # if self._serial_main_program: - # print("WARNING: The program attached to this distributed context will be replaced by the new one.") - self._original_serial_main_program = program - self._serial_main_program = program - @property def serial_startup_program(self): return self._serial_startup_program @@ -149,6 +145,18 @@ def serial_feed_vars(self): def serial_fetch_vars(self): return self._serial_fetch_vars + @property + def dist_main_programs(self): + return self._dist_main_programs + + @property + def dist_startup_programs(self): + return self._dist_startup_programs + + @property + def cluster(self): + return self._cluster + @property def strategy(self): return self._strategy @@ -177,14 +185,6 @@ def dist_op_context(self): def block_state(self): return self._block_state - @property - def dist_main_programs(self): - return self._dist_main_programs - - @property - def dist_startup_programs(self): - return self._dist_startup_programs - @property def has_annotation(self): return len(self._dist_tensors_for_program) or len( @@ -198,17 +198,168 @@ def gradient_scale(self): def gradient_scale(self, gs): self._gradient_scale = gs - def initialize(self): - if not self._is_initialized: + def _backup_serial_info(self, mode): + self._backup_serial_main_program_stack.append( + self._serial_main_program.clone()) + self._backup_serial_startup_program_stack.append( + self._serial_startup_program.clone()) + self._backup_pass_context_stack.append( + copy.deepcopy(self._pass_context)) + self._backup_block_state_stack.append(copy.deepcopy(self._block_state)) + + def _backup_dist_info(self, mode): + self._backup_dist_tensors_for_program_stack.append( + copy.deepcopy(self._dist_tensors_for_program)) + self._backup_dist_ops_for_program_stack.append( + copy.deepcopy(self._dist_ops_for_program)) + + def _backup(self, serial=True, serial_mode=None, dist=True, dist_mode=None): + # Use this function carefully + if serial: + self._backup_serial_info(serial_mode) + if dist: + self._backup_dist_info(dist_mode) + + def _restore_serial_info(self, mode="to_backup"): + if mode == "to_backup": + self._serial_main_program = self._backup_serial_main_program_stack.pop( + ) + self._serial_startup_program = self._backup_serial_startup_program_stack.pop( + ) + elif mode == "to_original": + assert self._original_serial_main_program is not None + assert self._original_serial_startup_program is not None self._serial_main_program = self._original_serial_main_program.clone( ) self._serial_startup_program = self._original_serial_startup_program.clone( ) - self._serial_main_program = self._original_serial_main_program - self._serial_startup_program = self._original_serial_startup_program - self._serial_loss = self._original_serial_loss - self._serial_optimizer = self._original_serial_optimizer + + self._serial_optimizer = self._original_serial_optimizer + + if self._original_serial_loss: + if 
isinstance(self._original_serial_loss, list): + assert len(self._original_serial_loss) == 1 + loss = self._original_serial_loss[0] + block_idx = loss.block.idx + var_name = loss.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + self._serial_loss = var + else: + block_idx = self._original_serial_loss.block.idx + var_name = self._original_serial_loss.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + self._serial_loss = var + + for key, var_list in self._original_serial_feed_vars.items(): + new_var_list = [] + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) + self._serial_feed_vars[key] = new_var_list + + for key, var_list in self._original_serial_fetch_vars.items(): + new_var_list = [] + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) + self._serial_fetch_vars[key] = new_var_list + + self._pass_context = self._backup_pass_context_stack.pop() + self._block_state = self._backup_block_state_stack.pop() + + def _restore_dist_info(self, mode="to_backup"): + if mode == "to_backup": + self._dist_tensors_for_program = self._backup_dist_tensors_for_program_stack.pop( + ) + self._dist_ops_for_program = self._backup_dist_ops_for_program_stack.pop( + ) + elif mode == "to_original": + assert self._original_dist_tensors_for_program + assert self._original_dist_ops_for_program + self._dist_tensors_for_program = copy.deepcopy( + self._original_dist_tensors_for_program) + self._dist_ops_for_program = copy.deepcopy( + self._original_dist_ops_for_program) + elif mode == "to_default": + new_tensors_ids = [] + for tensor_id, dist_tensor in self._dist_tensors_for_program.items( + ): + if tensor_id in self._tensors_ids: + dist_tensor.dist_attr.reset() + else: + new_tensors_ids.append(tensor_id) + for tensor_id in new_tensors_ids: + self._dist_tensors_for_program.pop(tensor_id) + new_ops_ids = [] + for op_id, dist_op in self._dist_ops_for_program.items(): + if op_id in self._ops_ids: + dist_op.dist_attr.reset() + else: + new_ops_ids.append(op_id) + for op_id in new_ops_ids: + self._dist_ops_for_program.pop(op_id) + else: + new_tensors_ids = [] + for tensor_id, dist_tensor in self._dist_tensors_for_program.items( + ): + new_tensors_ids.append(tensor_id) + for tensor_id in new_tensors_ids: + self._dist_tensors_for_program.pop(tensor_id) + new_ops_ids = [] + for op_id, dist_op in self._dist_ops_for_program.items(): + new_ops_ids.append(op_id) + for op_id in new_ops_ids: + self._dist_ops_for_program.pop(op_id) + self._dist_main_programs = {} + self._dist_startup_programs = {} + self._dist_op_context = DistributedOperatorContext() + self._need_copy_dist_attr_to_graph = True + self._process_meshes = [] + + def _restore(self, + serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_backup"): + # Use this function carefully + if serial: + self._restore_serial_info(serial_mode) + if dist: + self._restore_dist_info(dist_mode) + + def initialize(self): + if not self._is_initialized: + if not self._serial_main_program: + self._serial_main_program = self._original_serial_main_program + if not self._serial_startup_program: + self._serial_startup_program = self._original_serial_startup_program + if not self._serial_loss: + if isinstance(self._original_serial_loss, list): + assert 
len(self._original_serial_loss) == 1 + self._serial_loss = self._original_serial_loss[0] + else: + self._serial_loss = self._original_serial_loss + if not self._serial_optimizer: + self._serial_optimizer = self._original_serial_optimizer + if not self._serial_feed_vars: + self._serial_feed_vars = self._original_serial_feed_vars + if not self._serial_fetch_vars: + self._serial_fetch_vars = self._original_serial_fetch_vars + self._init_dist_attr_for_program() + # Backup the original distributed information for later restore + self._original_dist_tensors_for_program = copy.deepcopy( + self._dist_tensors_for_program) + self._original_dist_ops_for_program = copy.deepcopy( + self._dist_ops_for_program) self._tensors_ids = list(self._dist_tensors_for_program.keys()) self._ops_ids = list(self._dist_ops_for_program.keys()) set_flags({"FLAGS_convert_all_blocks": True}) @@ -216,41 +367,9 @@ def initialize(self): core.Graph(self._serial_main_program.desc)) self._init_dist_attr_for_graph() self._is_initialized = True - - # def reset(self, - # skip_dist_tensors=None, - # skip_dist_ops=None, - # skip_tensor_dist_attr_fields=None, - # skip_op_dist_attr_fields=None): - # self._serial_main_program = self._original_serial_main_program.clone() - # self._serial_startup_program = self._original_serial_startup_program.clone() - # new_tensors_ids = [] - # for tensor_id, dist_tensor in self._dist_tensors_for_program.items(): - # if tensor_id in self._tensors_ids: - # dist_tensor.dist_attr.reset(skip_tensor_dist_attr_fields) - # else: - # new_tensors_ids.append(tensor_id) - # for tensor_id in new_tensors_ids: - # self._dist_tensors_for_program.pop(tensor_id) - # new_ops_ids = [] - # for op_id, dist_op in self._dist_ops_for_program.items(): - # if op_id in self._ops_ids: - # dist_op.dist_attr.reset(skip_op_dist_attr_fields) - # else: - # new_ops_ids.append(op_id) - # for op_id in new_ops_ids: - # self._dist_ops_for_program.pop(op_id) - - # self.copy_dist_attr_from_program_to_graph() - - # self._dist_main_programs = {} - # self._dist_startup_programs = {} - - # self._pass_context = PassContext() - - # self._dist_op_context = DistributedOperatorContext() - - # self._process_meshes = [] + self._need_copy_dist_attr_to_graph = False + if self._need_copy_dist_attr_to_graph: + self.copy_dist_attr_from_program_to_graph() def add_process_mesh(self, process_mesh): assert isinstance(process_mesh, ProcessMesh), \ @@ -419,6 +538,10 @@ def _init_dist_attr_for_program(self, no_default=False): if current_dist_op is None: dist_op = DistributedOperator(op) self.add_dist_op_for_program(dist_op) + self._original_dist_tensors_for_program = copy.deepcopy( + self._dist_tensors_for_program) + self._original_dist_ops_for_program = copy.deepcopy( + self._dist_ops_for_program) def _order_nodes_by_program_order(self): def _contains(nodes, target_node): @@ -588,7 +711,7 @@ def copy_dist_attr_from_graph_to_program(self): op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph - # TODO: the completion algorithm will skip orphan tensors, + # TODO: the completion algorithm will skipped orphan tensors, # here we just set there process_mesh to the first one. 
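The _backup_*/_restore_* helpers added to DistributedContext above act as a save-point mechanism: the serial programs, pass context, block state and per-program dist attrs can be pushed onto stacks and later popped ("to_backup"), reset to the originals ("to_original"), or wiped back to default attributes ("to_default"). A hypothetical driver loop in the spirit of the commented-out calls in parallelizer_v2.py further down, not code from this patch:

def parallelize_all_ranks(dist_context, parallelizer, ranks):
    for rank in ranks:
        # push a save-point of the serial programs and the dist attrs
        dist_context._backup(serial=True, dist=True)
        parallelizer.parallel(rank)
        # pop the save-point so the next rank starts from a clean context
        dist_context._restore(serial=True, serial_mode="to_backup",
                              dist=True, dist_mode="to_backup")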
for orphan_node in self._serial_orphan_tensor_nodes: serial_tensor_id = orphan_node.var().id() @@ -614,16 +737,21 @@ def amend_dist_attr_for_program(self): tensor_shape = serial_tensor.shape dims_mapping = dist_attr.dims_mapping process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_processes = dist_attr.process_mesh.processes # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len(process_mesh_processes) == 1: + dims_mapping[i] = -1 for dist_op in self._dist_ops_for_program.values(): serial_op = dist_op.serial_op dist_attr = dist_op.dist_attr + process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_processes = dist_attr.process_mesh.processes for arg_name in serial_op.input_arg_names: if dist_op.get_serial_input(arg_name) is None: tensor_shape = [] @@ -635,13 +763,15 @@ def amend_dist_attr_for_program(self): else: tensor_shape = dist_op.get_serial_input(arg_name).shape dims_mapping = dist_attr.get_input_dims_mapping(arg_name) - process_mesh_shape = dist_attr.process_mesh.topology # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len( + process_mesh_processes) == 1: + dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ @@ -650,13 +780,18 @@ def amend_dist_attr_for_program(self): else: tensor_shape = dist_op.get_serial_output(arg_name).shape dims_mapping = dist_attr.get_output_dims_mapping(arg_name) - process_mesh_shape = dist_attr.process_mesh.topology # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) 
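The amendment rule above (and its two copies for op inputs and outputs further down) boils down to: reset a tensor dimension's mapping to -1 whenever the mesh axis it maps to has more processes than the dimension has elements, and shard nothing at all on a single-process mesh. A small self-contained illustration; the function name and arguments are hypothetical, not the framework API:

def amend_dims_mapping(tensor_shape, dims_mapping, mesh_topology, mesh_processes):
    """Return a copy of dims_mapping with invalid shardings reset to -1."""
    amended = list(dims_mapping)
    for i, dim in enumerate(tensor_shape):
        if amended[i] == -1:
            continue
        # the mesh axis has more processes than this dimension has elements
        if dim > 0 and mesh_topology[amended[i]] > dim:
            amended[i] = -1
        # a one-process mesh cannot shard anything
        if len(mesh_processes) == 1:
            amended[i] = -1
    return amended

# a [4, 512] tensor mapped onto an 8-way mesh axis: 4 < 8, so dim 0 is un-sharded
print(amend_dims_mapping([4, 512], [0, -1], [8], list(range(8))))  # [-1, -1]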
for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len( + process_mesh_processes) == 1: + dims_mapping[i] = -1 + if len(process_mesh_processes) == 1: + dist_op.dist_attr.impl_type = "default" + dist_op.dist_attr.impl_idx = 0 def validate_dist_attr_for_program(self): if not self._is_initialized: @@ -670,16 +805,20 @@ def validate_dist_attr_for_program(self): dist_tensor.serial_tensor.name) if (dist_tensor is not None) and ( not dist_tensor.validate_dist_attr()): - assert False, "Tensor {} has a wrong distributed attributes {}.".format( - dist_tensor.serial_tensor.name, dist_tensor.dist_attr) + assert False, "Tensor {} (id: {}, original_id: {}) has a wrong distributed attributes {}.".format( + dist_tensor.serial_tensor.name, + dist_tensor.desc.id(), + dist_tensor.desc.original_id(), dist_tensor.dist_attr) for op in block.ops: dist_op = self.get_dist_op_for_program(op) assert dist_op is not None, \ "Operator {} does not have a distributed attribute.".format( dist_op.serial_op.type) if (dist_op is not None) and (not dist_op.validate_dist_attr()): - assert False, "Operator {} has a wrong distributed attributes {}.".format( - dist_op.serial_op.type, dist_op.dist_attr) + assert False, "Operator {} (id: {}, original_id: {}) has a wrong distributed attributes {} .".format( + dist_op.serial_op.type, + dist_op.serial_op.desc.id(), + dist_op.serial_op.desc.original_id(), dist_op.dist_attr) return True def __deepcopy__(self, memo): diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index a42ce863492b3..e3f06da275182 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -41,7 +41,7 @@ def _validate_sizes_and_dist_attr(sizes, rank=None, shard_sizes=None): if not (isinstance(sizes, (list, tuple)) and - all(map(lambda x: isinstance(x, int) and x > 0, sizes))): + all(map(lambda x: isinstance(x, int) and x >= 0, sizes))): raise ValueError( "The sizes must be list or tuple and item in sizes must be non-negative integer, but got {}". 
format(sizes)) @@ -79,8 +79,11 @@ def get_local_sizes(global_sizes, local_sizes = [] # for even sharding, the local sizes of every rank are equal + for idx, item in enumerate(global_sizes): - if dims_mapping[idx] == -1: + # This is a trick to avoid dims_mapping is [] + val = dims_mapping[idx] if idx < len(dims_mapping) else -1 + if val == -1: local_sizes.append(item) else: local_sizes.append(item // topology[dims_mapping[idx]]) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index c38953ca9e64d..ab9391cf66fdb 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -31,10 +31,11 @@ from paddle.fluid.framework import Operator from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.distributed import fleet from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -from .cluster import Cluster +# from .cluster import Cluster, get_default_cluster from .planner_v2 import Planner from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator @@ -57,7 +58,11 @@ def __init__(self, self.inputs_spec = self._validate_spec(inputs_spec) self.labels_spec = self._validate_spec(labels_spec) self.cluster = cluster + # if self.cluster is None: + # self.cluster = get_default_cluster() self.strategy = strategy + if self.strategy is None: + self.strategy = fleet.DistributedStrategy() self._executor = None self._cur_rank = paddle.distributed.get_rank() @@ -69,11 +74,11 @@ def __init__(self, self._orig_main_prog = fluid.default_main_program() self._orig_startup_prog = fluid.default_startup_program() self._orig_dist_context = get_default_distributed_context() + self._dist_contexts = {} self._serial_main_progs = {} self._serial_startup_progs = {} self._dist_main_progs = defaultdict(dict) # dist main programs self._dist_startup_progs = defaultdict(dict) # dist startup programs - self._dist_contexts = {} self._feed_vars = {} self._fetch_vars = {} @@ -104,11 +109,17 @@ def prepare(self, parallelizer.parallel(self._cur_rank) else: parallelizer.parallel_all() - # Get the distributed main programs and startup programs + # Get the current content from the distributed context + self._serial_main_progs[mode] = self._dist_contexts[ + mode].serial_main_program + self._serial_startup_progs[mode] = self._dist_contexts[ + mode].serial_startup_program self._dist_main_progs[mode] = self._dist_contexts[ mode].dist_main_programs self._dist_startup_progs[mode] = self._dist_contexts[ mode].dist_startup_programs + self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars + self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars # Init comm and startup program self._initialize(mode) @@ -135,20 +146,23 @@ def _build(self, mode): inputs = [self._set_data_parallel(var) for var in inputs] labels = [self._set_data_parallel(var) for var in labels] - self._feed_vars[mode] = {"inputs": inputs, "labels": labels} + # self._feed_vars[mode] = {"inputs": inputs, "labels": labels} + feed_vars = {"inputs": inputs, "labels": labels} - self._fetch_vars[mode] = { + # self._fetch_vars[mode] = { + # "outputs": flatten(outputs), + # "loss": losses, + # "metrics": metrics + # } + fetch_vars = { "outputs": flatten(outputs), "loss": losses, "metrics": metrics } - self._serial_main_progs[mode] = serial_main_prog - self._serial_startup_progs[mode] = 
serial_startup_prog self._dist_contexts[mode] = DistributedContext( - self._serial_main_progs[mode], self._serial_startup_progs[mode], - self._optimizer, losses, self._feed_vars[mode], - self._fetch_vars[mode], self.strategy) + serial_main_prog, serial_startup_prog, self._optimizer, losses, + feed_vars, fetch_vars, self.cluster, self.strategy) self._dist_contexts[mode].gradient_scale = self._gradient_scale def _initialize(self, mode): diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 3ff474697205e..295e3557df27d 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -16,7 +16,7 @@ from .common import DistributedOperatorImpl from .common import register_distributed_operator_impl_container from .common import register_distributed_operator_impl -from .common import find_best_compatible_distributed_operator_impl +from .common import find_compatible_distributed_operator_impls from . import dist_embedding from . import dist_matmul from . import dist_reshape diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 441eb88a9f1ee..6b3c655f293bd 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -157,9 +157,7 @@ def register_distributed_operator_impl(op_type, dist_impl): assert False, "Must register distributed operator registry first." -def find_best_compatible_distributed_operator_impl(dist_op, - fwd=True, - partial=True): +def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True): """ Here just return the first compatible implemention. This will be improved by cost model in the future. 
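The rename is mechanical for callers: the helper keeps its dist_op, fwd and partial parameters and is re-exported from operators/__init__.py as shown above. A minimal sketch of an updated call site, assuming the package is importable:

from paddle.distributed.auto_parallel.operators import (
    find_compatible_distributed_operator_impls,  # formerly find_best_compatible_distributed_operator_impl
)

def pick_impl(dist_op):
    # same keyword arguments as before the rename
    return find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True)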
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 6d9b48ea1e87c..78f30422e742f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -187,7 +187,7 @@ def is_auto_compatible(self, dist_op): for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if serial_tensor.is_parameter: + if serial_tensor is not None and serial_tensor.is_parameter: for mapping in dims_mapping: if mapping != -1: return False @@ -217,7 +217,7 @@ def is_auto_compatible(self, dist_op): for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if serial_tensor.is_parameter: + if serial_tensor is not None and serial_tensor.is_parameter: for mapping in dims_mapping: if mapping != -1: return False diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index 89cd2c9d9e41a..4d52e5a94beb1 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -22,7 +22,6 @@ from .common import register_distributed_operator_impl from .common import set_comm_op_dist_attr_for_program from .dist_default import DistributedDefaultImpl0 -from ..reshard import Resharder from ..process_group import new_process_group from ..utils import is_dim_shard, is_dim_replicate, _get_corresponding_rank from ..utils import compute_compatible_dim_mapping, set_dist_op_desc_original_id, _get_comm_group @@ -324,6 +323,8 @@ def backward(ctx, *args, **kwargs): process_mesh_shape = op_dist_attr.process_mesh.topology process_mesh_group = op_dist_attr.process_mesh.processes dims_mapping = [0] + [-1 for _ in range(len(new_X_grad.shape) - 1)] + from ..reshard import Resharder + partition_idx = Resharder.compute_partition_index( rank_id, new_X_grad.shape, dims_mapping, process_mesh_shape, process_mesh_group) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index 6a94bbd3130b9..218513323dffb 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -35,7 +35,7 @@ def __init__(self, mode, completer, dist_context): self._mode = mode self._completer = completer self._dist_context = dist_context - self._dist_context.initialize() + assert self._dist_context._is_initialized self._pass_context = self._dist_context.pass_context self._strategy = self._dist_context.strategy @@ -43,7 +43,9 @@ def parallel_all(self): world_process_group = get_world_process_group() all_ranks = world_process_group.ranks for rank in all_ranks: + # self._dist_context._backup(serial=True, dist=True) self.parallel(rank) + # self._dist_context._restore(serial=True, dist=True) def parallel(self, rank): serial_main_program = self._dist_context.serial_main_program @@ -51,13 +53,14 @@ def parallel(self, rank): serial_optimizer = self._dist_context.serial_optimizer if self._mode == "train" and serial_optimizer: # Generate backward - serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + serial_loss = self._dist_context.serial_loss params_grads = self._generate_backward( 
serial_main_program, serial_startup_program, serial_loss) # Apply pre optimization passes self._apply_pre_optimization(serial_main_program, serial_startup_program, serial_loss, serial_optimizer, params_grads) + # Do logical partition partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( @@ -85,7 +88,6 @@ def parallel(self, rank): resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, [], 1) resharder.reshard() - # Clone program for test if self._mode != 'train': dist_main_prog = dist_main_prog.clone(for_test=True) diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 91a31dd1b922e..ce686fd6a5683 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -211,7 +211,7 @@ def partition_block(self, ref_block, target_block): forward_op_id2forward_op = {} for idx in range(len(serial_ops)): if idx <= last_fwd_op_idx: - forward_op_id2forward_op[serial_ops[idx].desc.id( + forward_op_id2forward_op[serial_ops[idx].desc.original_id( )] = serial_ops[idx] appended_grad_times = 0 @@ -408,9 +408,9 @@ def _partition_var(dist_context, src_block, dst_block, src_varname, def _get_dist_op_backward_implement(backward_op, dist_context, forward_op_id2forward_op): dist_op_context = dist_context.dist_op_context - if backward_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - forward_op_id = dist_op_context.grad_op_id_to_op_id[backward_op.desc.id( - )] + if backward_op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + forward_op_id = dist_op_context.grad_op_id_to_op_id[ + backward_op.desc.original_id()] forward_op = forward_op_id2forward_op[forward_op_id] forward_op_dist_attr = dist_context.get_op_dist_attr_for_program( forward_op) diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py index 7db17e98d07ee..3625a25d74e0e 100755 --- a/python/paddle/distributed/auto_parallel/planner_v2.py +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -16,6 +16,8 @@ from .dist_context import get_default_distributed_context from .utils import print_program_with_dist_attr +# from .tuner.parallel_tuner import ParallelTuner + class Planner: def __init__(self, mode, dist_context): @@ -24,19 +26,28 @@ def __init__(self, mode, dist_context): # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need # dependency of backward-forward ops in forward completion. + # TODO: The id mapping will be lost if we clone the original program. 
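Keying forward_op_id2forward_op and the grad_op_id_to_op_id lookup by desc.original_id() instead of desc.id() keeps the grad-op to forward-op mapping stable when the context hands out cloned programs; the TODO in planner_v2.py above notes that plain descriptor ids do not survive cloning. A reduced sketch of the lookup pattern (the helper names here are hypothetical):

def build_forward_lookup(serial_ops, last_fwd_op_idx):
    # key by original_id so the map still resolves after Program.clone()
    return {
        op.desc.original_id(): op
        for idx, op in enumerate(serial_ops) if idx <= last_fwd_op_idx
    }

def find_forward_op(backward_op, grad_op_id_to_op_id, forward_op_id2forward_op):
    key = backward_op.desc.original_id()
    if key in grad_op_id_to_op_id:
        return forward_op_id2forward_op[grad_op_id_to_op_id[key]]
    return None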
default_ctx = get_default_distributed_context() self._dist_context._dist_op_context = default_ctx.dist_op_context self._dist_context.initialize() self._completer = Completer(self._dist_context) + self._strategy = dist_context.strategy + # if self._strategy.auto_search: + # self._parallel_tuner = ParallelTuner( + # self._dist_context, mode=self._mode) + @property def completer(self): return self._completer def plan(self): self._completer.complete_forward_annotation() + # if self._strategy.auto_search: + # self._parallel_tuner.tune() + # else: + # self._completer.complete_forward_annotation() # parse forward sub block self._dist_context.block_state.parse_forward_blocks( self._dist_context.serial_main_program) - # TODO: add the auto searcher diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index fbe3a43a7917a..42d90b0d4d619 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -324,10 +324,13 @@ def _get_corresponding_rank(dist_context, target_mesh, rank): mesh.processes.index(rank)) break - assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( - rank) - return target_mesh.processes[_coordinate2linear_idx(mesh.topology, - coordinate)] + # assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( + # rank) + if coordinate is not None: + return target_mesh.processes[_coordinate2linear_idx(mesh.topology, + coordinate)] + else: + return target_mesh.processes[0] def _get_unshard_dist_shape(var, dist_attr): diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index cd03e55f25f61..5f481bd0dca41 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -403,6 +403,11 @@ def new_group(ranks=None, backend=None): _group_map_by_name[group_name] = group _group_map[gid] = group + # TODO(shenliang03): This is a temporary solution to solve the problem of + # hang caused by tcp + tmp = paddle.to_tensor([1], dtype="int32") + paddle.distributed.all_reduce(tmp, group=group, use_calc_stream=True) + paddle.distributed.wait(tmp) return group if not backend: diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index b6698a200e945..de36f8503a651 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -18,6 +18,7 @@ import numpy as np from paddle import _C_ops import paddle.fluid.core as core +from paddle.fluid.framework import _in_legacy_dygraph, _non_static_mode, in_dygraph_mode _hcg = None _use_cache = False @@ -148,9 +149,15 @@ def set_send_message(self, tensor): def _is_valid_send_recv_partial(tensor, mp_degree): - tensor_numel = np.prod(tensor.shape) - assert tensor_numel != 0, "can't send/recv zero element" - return mp_degree > 1 and tensor_numel % mp_degree == 0 + + if _in_legacy_dygraph(): + tensor_numel = np.prod(tensor.shape) + assert tensor_numel != 0, "can't send/recv zero element" + return mp_degree > 1 and tensor_numel % mp_degree == 0 + elif in_dygraph_mode(): + # TODO(shenliang03) support mp+pp optimizer in future. 
+ # (partial_send/partial_recv/partial_allgather_) + return False def send_partial(tensor, diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 049d3ffa3694f..e44b5d2515d83 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -205,7 +205,7 @@ def _clear_gradients(self): for param in list(self._unslice_params): param.clear_gradient(False) tmp_var = param.cuda(DEV_ID) - param._clear_data() + if tmp_var.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_var = paddle.cast(tmp_var, Type.fp16.value) @@ -272,6 +272,8 @@ def _handle_unslice_params(self): master_tensor = paddle.cast(param, Type.fp32.value) master_tensor.name = param.name self._optim._master_weights[param.name] = master_tensor + if self._offload: + param.master_weight = paddle.cast(param, Type.fp32.value).cpu() param2dtype[param.name] = param.dtype p_align = self._param2align(param) self._unslice_params2align[param.name] = p_align @@ -369,7 +371,6 @@ def _param_storage(self, param, buffer_size): tmp_var.get_tensor().set(param_cpu.get_tensor(), core.CPUPlace()) del tmp_var param.get_tensor()._set_dims(param_shape) - param._clear_data() # Current rank param_storage if self._offload: @@ -379,6 +380,9 @@ def _param_storage(self, param, buffer_size): value=tmp_tensor, place=core.CPUPlace(), name="slice@" + param.name) + with device_guard(): + param.master_weight = paddle.cast(param.fw_storage, + Type.fp32.value) else: param.fw_storage = core.eager.Tensor( value=buffer._slice(start, end), name="slice@" + param.name) @@ -389,6 +393,7 @@ def _param_storage(self, param, buffer_size): master_tensor = paddle.cast(param.fw_storage, Type.fp32.value) master_tensor.name = param.name self._optim._master_weights[param.fw_storage.name] = master_tensor + param._clear_data() def _register_forward_hooks(self, layer): """ @@ -480,9 +485,8 @@ def _update_params(self): collective.all_reduce(tensor=grad_storage.buffer, group=self._group) if self._offload: for param in list(self._unslice_params): - tmp_var = _device2cpu(param, convert_dtype=True) - tmp_var._share_buffer_to(param) - del tmp_var + param._clear_data() + param.master_weight._share_buffer_to(param) for grad_storage in self._grad_storages.values(): for p in grad_storage._params: @@ -568,7 +572,8 @@ def allreduce_(*_): del self._task_flow.full_param[param.name] if self._offload: - param.fw_storage = _device2cpu(param.fw_storage, True) + param.fw_storage._clear_data() + param.master_weight._share_buffer_to(param.fw_storage) return allreduce_ @@ -856,6 +861,7 @@ def _PartitionParam(param): if not hasattr(param, "fw_storage"): setattr(param, "fw_storage", None) setattr(param, "bw_storage", None) + setattr(param, "master_weight", None) setattr(param, "status", "all") setattr(param, "use_count", 0) return param @@ -864,6 +870,7 @@ def _PartitionParam(param): def _UnsliceParam(param): if not hasattr(param, "unslice"): setattr(param, "unslice", True) + setattr(param, "master_weight", None) return param diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index f96273cc84caf..7bb1517f12169 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ 
b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -199,7 +199,7 @@ def _clear_gradients(self): param.clear_gradient(False) param._gradient_set_empty(False) tmp_var = param.cuda(DEV_ID) - param._clear() + if tmp_var.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_var = paddle.cast(tmp_var, Type.fp16.value) @@ -220,19 +220,14 @@ def _update_params_slice(self): self._optim._param_groups = slice_params + list( self._unslice_params) else: - params_name_list = list(map(lambda p: p.name, update_list)) - fw_storage_name_list = list( - map(lambda p: p.fw_storage.name, update_list)) for param_group in self._optim._param_groups: p_group = [] for p in param_group['params']: - if p.name in params_name_list: + if hasattr(p, "fw_storage"): p_group.append(p.fw_storage) - elif p.name in fw_storage_name_list: - p_group.append(update_list[fw_storage_name_list.index( - p.name)].fw_storage) - elif p in self._unslice_params: + else: p_group.append(p) + param_group['params'] = p_group def forward(self, *inputs, **kwargs): @@ -268,6 +263,8 @@ def _handle_unslice_params(self): if param.dtype == Type.fp16.value and not self._offload: self._optim._master_weights[param.name] = paddle.cast( param, Type.fp32.value) + if self._offload: + param.master_weight = paddle.cast(param, Type.fp32.value).cpu() param2dtype[param.name] = param.dtype p_align = self._param2align(param) self._unslice_params2align[param.name] = p_align @@ -335,11 +332,12 @@ def _add_manage_info(trainable_param): self._param2buffer[param.name].append( (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) - # 3.Flatten layer params and release other rank buffer - self._param_storage(param, buffer_size) # Record param's dtype param2dtype[param.name] = param.dtype + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + def _param_storage(self, param, buffer_size): """ This is a function to simplify the handling of parameter InternalStorages. 
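Both stage-3 sharding implementations now keep a persistent fp32 master copy on the host when offload is enabled, and share its buffer back into the parameter at update time instead of re-casting and copying every step. The core idea as a reduced sketch; paddle must be available, and the private _clear_data/_share_buffer_to calls mirror those used in group_sharded_stage3.py above (the legacy file uses _clear() instead):

import paddle

def make_offload_master(param):
    # cast once at setup time and pin the fp32 master copy on the CPU
    param.master_weight = paddle.cast(param, "float32").cpu()

def swap_in_master(param):
    # replaces the old _device2cpu(param, convert_dtype=True) round trip:
    # drop the stale storage and alias the persistent master buffer
    param._clear_data()
    param.master_weight._share_buffer_to(param)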
@@ -365,13 +363,15 @@ def _param_storage(self, param, buffer_size): tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), core.CPUPlace()) param.value().get_tensor()._set_dims(param_shape) - param._clear() # Current rank param_storage if self._offload: param.fw_storage = core.VarBase( buffer._slice(start, end), core.CPUPlace(), "slice@" + param.name) + with device_guard(device="cpu"): + param.master_weight = paddle.cast(param.fw_storage, + Type.fp32.value) else: param.fw_storage = core.VarBase( buffer._slice(start, end), "slice@" + param.name) @@ -381,6 +381,7 @@ def _param_storage(self, param, buffer_size): if param.dtype == Type.fp16.value and not self._offload: self._optim._master_weights[param.fw_storage.name] = paddle.cast( param.fw_storage, Type.fp32.value) + param._clear() def _register_forward_hooks(self, layer): """ @@ -482,9 +483,8 @@ def _update_params(self): if self._offload: for param in list(self._unslice_params): - tmp_var = _device2cpu(param, convert_dtype=True) - tmp_var._share_buffer_to(param) - tmp_var._clear() + param._clear() + param.master_weight._share_buffer_to(param) for grad_storage in self._grad_storages.values(): for p in grad_storage._params: @@ -553,8 +553,9 @@ def allreduce_(*_): cpu_grad = _device2cpu( core.VarBase(full_grad._slice(start, end)) .detach().clone(), True) - param.bw_storage = paddle.add(param.bw_storage, - cpu_grad) + with device_guard(device="cpu"): + param.bw_storage = paddle.add(param.bw_storage, + cpu_grad) else: # param.bw_storage.add_( # core.VarBase(full_grad._slice(start, end)) @@ -581,7 +582,8 @@ def allreduce_(*_): tmp_var._clear() if self._offload: - param.fw_storage = _device2cpu(param.fw_storage, True) + param.fw_storage._clear() + param.master_weight._share_buffer_to(param.fw_storage) return allreduce_ @@ -869,6 +871,7 @@ def _PartitionParam(param): if not hasattr(param, "fw_storage"): setattr(param, "fw_storage", None) setattr(param, "bw_storage", None) + setattr(param, "master_weight", None) setattr(param, "status", "all") setattr(param, "use_count", 0) return param @@ -877,6 +880,7 @@ def _PartitionParam(param): def _UnsliceParam(param): if not hasattr(param, "unslice"): setattr(param, "unslice", True) + setattr(param, "master_weight", None) return param diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 1285e1f3323ff..5e2ad43c16431 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -140,17 +140,12 @@ def broadcast_dp_parameters(model, hcg): def fused_allreduce_gradients(parameter_list, hcg): - if _in_legacy_dygraph(): - data_parallel_group = None if hcg is None else hcg.get_data_parallel_group( - ) - logger.debug("dp start fuse allreduce gradients") - with framework.no_grad(): - _apply_collective_grads(parameter_list, data_parallel_group) - elif in_dygraph_mode(): - assert hcg is None, "It's not support to use hcg in EagerDygraph now." 
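When offload is enabled, the sliced gradient now accumulates on the host under a CPU device guard, so the running sum never bounces back to the accelerator. A reduced sketch of that step; paddle.set_device stands in for the file's device_guard helper purely to keep the snippet self-contained:

import paddle

def accumulate_offloaded_grad(param, sliced_grad):
    # bring the freshly reduced slice to the host in the dtype of bw_storage
    cpu_grad = paddle.cast(sliced_grad, param.bw_storage.dtype).cpu()
    prev = paddle.get_device()
    paddle.set_device("cpu")
    try:
        # keep the running gradient sum in the CPU-resident bw_storage
        param.bw_storage = paddle.add(param.bw_storage, cpu_grad)
    finally:
        paddle.set_device(prev)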
- data_parallel_group = paddle.distributed.collective._get_default_group() - with framework.no_grad(): - _apply_collective_grads_eager(parameter_list, data_parallel_group) + data_parallel_group = None if hcg is None else hcg.get_data_parallel_group() + logger.debug("dp start fuse allreduce gradients") + apply_func = _apply_collective_grads_eager if in_dygraph_mode( + ) else _apply_collective_grads + with framework.no_grad(): + apply_func(parameter_list, data_parallel_group) def sharding_reduce_gradients(parameter_list, hcg): @@ -162,29 +157,36 @@ def sharding_reduce_gradients(parameter_list, hcg): sharding_nrank = hcg.get_sharding_parallel_group().nranks for param in parameter_list: if param.trainable and (param._grad_ivar() is not None): - - g_var = param._grad_ivar() - - # need use trace_op to allreduce - # paddle.distributed.all_reduce( - # g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="c_allreduce_sum", - inputs={'X': g_var}, - outputs={'Out': g_var}, - attrs={ - 'ring_id': hcg.get_sharding_parallel_group().id, - 'use_calc_stream': True - }) - - # grad / sharding_rank - div_factor = paddle.to_tensor(sharding_nrank, dtype=g_var.dtype) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="elementwise_div", - inputs={'X': g_var, - 'Y': div_factor}, - outputs={'Out': g_var}, - attrs={'axis': -1}) + if in_dygraph_mode(): + param.grad.scale_(1.0 / sharding_nrank) + paddle.distributed.all_reduce( + param.grad, + group=hcg.get_sharding_parallel_group(), + use_calc_stream=True) + + elif _in_legacy_dygraph(): + g_var = param._grad_ivar() + # need use trace_op to allreduce + # paddle.distributed.all_reduce( + # g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="c_allreduce_sum", + inputs={'X': g_var}, + outputs={'Out': g_var}, + attrs={ + 'ring_id': hcg.get_sharding_parallel_group().id, + 'use_calc_stream': True + }) + + # grad / sharding_rank + div_factor = paddle.to_tensor( + sharding_nrank, dtype=g_var.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': g_var, + 'Y': div_factor}, + outputs={'Out': g_var}, + attrs={'axis': -1}) def broadcast_sharding_parameters(model, hcg): diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 08c8f0835c5e1..fbea5d0db869e 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -17,6 +17,7 @@ from .node import Node from .status import Status from .args_envs import parse_args, fetch_envs, env_args_mapping +import six import logging @@ -39,6 +40,12 @@ def __init__(self, enable_plugin=True): if enable_plugin: self._enable_plugin() + def print(self): + self.logger.info("----------- Configuration ----------------------") + for arg, value in sorted(six.iteritems(vars(self.args))): + self.logger.info("%s: %s" % (arg, value)) + self.logger.info("--------------------------------------------------") + def is_legacy_mode(self): if self.args.legacy: return True diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py index b624281e44db3..b70dd7d3f759f 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -35,6 +35,7 @@ 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', 
'PADDLE_GLOO_PORT': 'gloo_port', 'PADDLE_WITH_GLOO': 'with_gloo', + 'PADDLE_DEVICE_NUM': 'device_num' } @@ -85,7 +86,7 @@ def parse_args(): base_group.add_argument( "--run_mode", type=str, - default="collective", + default=None, help="run mode of the job, collective/ps/ps-heter") base_group.add_argument( @@ -100,6 +101,12 @@ def parse_args(): default=None, help="accelerate devices. as --gpus,npus,xps") + base_group.add_argument( + "--device_num", + type=int, + default=None, + help="the number of accelerate devices.") + base_group.add_argument("--host", type=str, default=None, help="host ip") base_group.add_argument( @@ -125,7 +132,7 @@ def parse_args(): ps_group.add_argument( "--gloo_port", type=int, default=6767, help="gloo http port") ps_group.add_argument( - "--with_gloo", type=str, default="0", help="use gloo or not") + "--with_gloo", type=str, default="1", help="use gloo or not") # parameter elastic mode elastic_group = parser.add_argument_group("Elastic Parameters") diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index 30b8cc1538590..61ffe8e809564 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -21,6 +21,7 @@ class DeviceType: XPU = 'xpu' NPU = 'npu' MLU = 'mlu' + IPU = 'ipu' class Device(object): @@ -68,12 +69,18 @@ def get_selected_device_key(self): return 'FLAGS_selected_xpus' if self._dtype == DeviceType.MLU: return 'FLAGS_selected_mlus' + if self._dtype == DeviceType.IPU: + return 'FLAGS_selected_ipus' return 'FLAGS_selected_devices' - def get_selected_devices(self, devices=''): + def get_selected_devices(self, devices='', device_num=None): ''' return the device label/id relative to the visible devices ''' + if self._dtype == DeviceType.IPU: + if not device_num: + raise RuntimeError("The \'device_num\' is required by IPUs.") + return [str(device_num)] if not devices: return [str(x) for x in range(0, len(self._labels))] else: @@ -129,6 +136,9 @@ def detect_device(self): dev._dtype = DeviceType.MLU num = fluid.core.get_mlu_device_count() visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_ipu(): + dev._dtype = DeviceType.IPU + num = fluid.core.get_ipu_device_count() if num == 0: dev._dtype = DeviceType.CPU diff --git a/python/paddle/distributed/launch/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py index 706131300f0d8..f1c6ea5399a46 100644 --- a/python/paddle/distributed/launch/controllers/__init__.py +++ b/python/paddle/distributed/launch/controllers/__init__.py @@ -29,4 +29,5 @@ def init(ctx): for c in _controllers: if c.enable(ctx): + ctx.print() return c(ctx) diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 3763bac041451..166eb3a4f9dfd 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
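How the new --device_num flag feeds device selection, in a hedged and simplified rendering of Device.get_selected_devices above (not the real implementation): for IPUs the launcher exports a device count instead of an id list, which is why --device_num becomes mandatory on that path.

    def get_selected_devices(dtype, devices='', device_num=None):
        # simplified sketch; the real method also remaps ids against the visible devices
        if dtype == 'ipu':
            if not device_num:
                raise RuntimeError("The 'device_num' is required by IPUs.")
            return [str(device_num)]               # later exported as FLAGS_selected_ipus
        if not devices:
            return [str(i) for i in range(0, 4)]   # placeholder for "all visible devices"
        return [d.strip() for d in devices.split(',')]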
-from .controller import Controller +from .controller import Controller, ControleMode import json import os @@ -23,8 +23,10 @@ class CollectiveController(Controller): @classmethod def enable(cls, ctx): + # collective is the default mode if ctx: ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.COLLECTIVE return True else: return False @@ -77,7 +79,8 @@ def build_pod(self): self.pod.reset() selected_dev_key = self.ctx.node.device.get_selected_device_key() selected_dev_list = self.ctx.node.device.get_selected_devices( - self.ctx.args.devices) + self.ctx.args.devices, self.ctx.args.device_num) + for i in range(self.pod.replicas): e = { "PADDLE_MASTER": collective_master, @@ -85,6 +88,7 @@ def build_pod(self): "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas), "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset), "PADDLE_LOCAL_RANK": "{}".format(i), + "PADDLE_NNODES": "{}".format(self.job.replicas), ## compatible env "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints), "PADDLE_CURRENT_ENDPOINT": endpoints[i], @@ -92,7 +96,8 @@ def build_pod(self): "PADDLE_TRAINERS_NUM": "{}".format(global_size), "PADDLE_RANK_IN_NODE": str(i), } - if self.pod.replicas == 1: + + if self.pod.replicas == 1 or self.ctx.node.device.dtype == "ipu": e.update({selected_dev_key: ",".join(selected_dev_list)}) else: e.update({selected_dev_key: selected_dev_list[i]}) @@ -106,6 +111,7 @@ class CollectiveElasticController(CollectiveController): def enable(cls, ctx): if ctx.args.master and ctx.args.master.startswith("etcd://"): ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.COLLECTIVE return True else: return False diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py index 43eda4cdffa24..742fea9e16de7 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -276,10 +276,20 @@ def fetch_peer_alive(self): return peer_alive def wait_peer_ready(self, replicas_min, replicas_max, timeout): + timeout = timeout if timeout > 1 else 3 + end = time.time() + timeout + np_pre = len(self.fetch_peer_alive()) while not self.ctx.status.is_done() and time.time() < end: - if len(self.fetch_peer_alive()) == replicas_max: + np = len(self.fetch_peer_alive()) + if np == replicas_max: + # maximum replicas reached, return immediately return (True, replicas_max) + elif np != np_pre: + # replicas are changing, reset timeout + end = time.time() + timeout + np_pre = np + time.sleep(0.2) else: time.sleep(0.5) diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py index 6504f1240ee09..037bd313bbc03 100644 --- a/python/paddle/distributed/launch/controllers/ps.py +++ b/python/paddle/distributed/launch/controllers/ps.py @@ -171,6 +171,7 @@ def _build_pod_with_master(self): for i in range(server_num): e = { + "PADDLE_NNODES": "{}".format(self.job.replicas), "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), "PADDLE_PORT": @@ -186,6 +187,7 @@ def _build_pod_with_master(self): for i in range(trainer_num): e = { + "PADDLE_NNODES": "{}".format(self.job.replicas), "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), "PADDLE_PORT": diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index 
b2c87e737c82d..92585c9e7657a 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -52,7 +52,9 @@ def launch(): - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``. - - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. + - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu/ipu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. + + - ``--device_num``: The number of selected accelerate devices on nodes, can be gpu/xpu/npu/mlu/ipu etc.. e.g., ``--device_num=4`` will require four devices per node. - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py index 35a44ed942c20..faa8f2823733c 100644 --- a/python/paddle/distributed/launch/plugins/__init__.py +++ b/python/paddle/distributed/launch/plugins/__init__.py @@ -17,6 +17,7 @@ __all__ = [] +# print configuration after args are well filled in controller init def log(ctx): ctx.logger.info("----------- Configuration ----------------------") for arg, value in sorted(six.iteritems(vars(ctx.args))): @@ -24,6 +25,20 @@ def log(ctx): ctx.logger.info("--------------------------------------------------") +def rewrite_ipu_script(ctx): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_ipu(): + import os + if ctx.args.training_script != "ipu": + raise RuntimeError( + "Only support to run the script \'ipu\' for IPU distributed computing." + ) + ctx.args.training_script = os.path.abspath( + os.path.join( + os.path.dirname(os.path.dirname(__file__)), + "utils/ipu_launch.py")) + + def process_args(ctx): # reset device by args #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus @@ -59,4 +74,6 @@ def rewrite_host_ip(ctx): ctx.node.ip = ctx.args.host -enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log] +enabled_plugins = [ + collective_compatible, rewrite_host_ip, process_args, rewrite_ipu_script +] diff --git a/python/paddle/distributed/launch/utils/ipu_launch.py b/python/paddle/distributed/launch/utils/ipu_launch.py new file mode 100644 index 0000000000000..595243cdf9d9c --- /dev/null +++ b/python/paddle/distributed/launch/utils/ipu_launch.py @@ -0,0 +1,167 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
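The ipu_launch.py helper introduced here assembles a poprun command line from these arguments; a hedged example of the invocation it ends up running (host names, counts and the training script are made up):

    # poprun --num-instances=2 --num-replicas=4 --ipus-per-replica=2 \
    #     --host=host1,host2 --vipu-partition=pod64 --vipu-server-host=10.0.0.1 \
    #     --update-partition=no --vipu-server-timeout=120 \
    #     --print-topology=yes --numa-aware=yes \
    #     --mpi-local-args='-x PADDLE_TRAINERS_NUM=2 -x PADDLE_TRAINER_ENDPOINTS=host1:8090,host2:8090' \
    #     --instance-mpi-local-args=0:"-x PADDLE_TRAINER_ID=0 -x PADDLE_CURRENT_ENDPOINT=host1:8090 -x PADDLE_RANK_IN_NODE=0" \
    #     --instance-mpi-local-args=1:"-x PADDLE_TRAINER_ID=1 -x PADDLE_CURRENT_ENDPOINT=host2:8090 -x PADDLE_RANK_IN_NODE=0" \
    #     /usr/bin/python3 train.py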
+ +import paddle.fluid as fluid + +import subprocess +import argparse +import os +import logging +import sys + + +class IPULaunch(object): + def __init__(self, hosts, ipus_per_replica, nproc_per_host, ipu_partition, + vipu_server, training_script, training_script_args): + if not fluid.core.is_compiled_with_ipu(): + raise RuntimeError( + "Can not call ipu_launch.py in non IPU compiled environment, please re-compile with WITH_IPU=ON." + ) + self._hosts = hosts + self._ipus_per_replica = ipus_per_replica + self._nproc_per_host = nproc_per_host + self._ipu_partition = ipu_partition + self._vipu_server = vipu_server + self._training_script = training_script + self._training_script_args = training_script_args + + self._num_ipus = int(os.getenv("FLAGS_selected_ipus")) + self.logger = self.get_logger() + + @classmethod + def parse_ipu_args(self): + parser = argparse.ArgumentParser() + parser.add_argument( + "--hosts", + type=str, + help="The hosts for IPU PopRun distributd computing.") + parser.add_argument( + "--ipus_per_replica", + type=int, + help="The number of IPUs per replica.") + parser.add_argument( + "--nproc_per_host", + type=int, + help="The number of processes per host.") + parser.add_argument( + "--ipu_partition", type=str, help="The partition name of IPU.") + parser.add_argument( + "--vipu_server", + type=str, + help="The vipu server host to enable vipu.") + parser.add_argument( + "training_script", + type=str, + help="The full path to the single IPU replica training program/script to be launched in parallel." + ) + parser.add_argument('training_script_args', nargs=argparse.REMAINDER) + args = parser.parse_args() + + ipu_launch = IPULaunch( + hosts=args.hosts, + ipus_per_replica=args.ipus_per_replica, + nproc_per_host=args.nproc_per_host, + ipu_partition=args.ipu_partition, + vipu_server=args.vipu_server, + training_script=args.training_script, + training_script_args=args.training_script_args, ) + + return ipu_launch + + def get_logger(self, level=logging.INFO): + logger = logging.getLogger("LAUNCH") + logger.setLevel(level) + formatter = logging.Formatter( + fmt='%(name)s %(levelname)s %(asctime)s %(message)s') + ch = logging.StreamHandler() + ch.setFormatter(formatter) + logger.addHandler(ch) + return logger + + def launch(self): + # The number of replicas for data parallel + assert (self._num_ipus % self._ipus_per_replica) == 0, \ + "The number of IPUs:{} mod the number of IPUs per replica:{} must == 0".format(self._num_ipus, self._ipus_per_replica) + num_replicas = self._num_ipus // self._ipus_per_replica + self.logger.info("The number of total replicas is {}.".format( + num_replicas)) + + # The number of processes + num_nodes = len(self._hosts.split(',')) + num_procs = num_nodes * self._nproc_per_host + self.logger.info("The number of total processes is {}.".format( + num_procs)) + assert (num_replicas % num_procs) == 0, \ + "The number of replicas:{} mod the number of processes:{} must == 0".format(num_replicas, num_procs) + + # hosts and endpoints + hosts = self._hosts.replace(' ', '').split(',') + endpoints = [x + ":8090" for x in hosts] + + # args for poprun + poprun_command = ['poprun'] + + poprun_command.append('--num-instances={}'.format(num_procs)) + poprun_command.append('--num-replicas={}'.format(num_replicas)) + poprun_command.append('--ipus-per-replica={}'.format( + self._ipus_per_replica)) + poprun_command.append('--host={}'.format(','.join(hosts))) + poprun_command.append('--vipu-partition={}'.format(self._ipu_partition)) + 
poprun_command.append('--vipu-server-host={}'.format(self._vipu_server)) + + poprun_command.extend([ + '--update-partition=no', '--vipu-server-timeout=120', + '--print-topology=yes', '--numa-aware=yes' + ]) + + # global envs + global_envs = '--mpi-local-args=\'' + log_level = os.getenv('POPART_LOG_LEVEL', None) + if log_level: + global_envs += '-x POPART_LOG_LEVEL={} '.format(log_level) + global_envs += '-x PADDLE_TRAINERS_NUM={} -x PADDLE_TRAINER_ENDPOINTS={}'.format( + num_procs, ','.join(endpoints)) + global_envs += '\'' + poprun_command.append(global_envs) + + # local envs + for idx in range(num_procs): + cur_endpoint = endpoints[idx // self._nproc_per_host] + rank_in_node = idx % self._nproc_per_host + poprun_command.append( + '--instance-mpi-local-args={}:\"-x PADDLE_TRAINER_ID={} -x PADDLE_CURRENT_ENDPOINT={} -x PADDLE_RANK_IN_NODE={}\"'. + format(idx, idx, cur_endpoint, rank_in_node)) + + # executor + poprun_command.append(sys.executable) + + # script and script args + poprun_command.append(self._training_script) + for arg in self._training_script_args: + poprun_command.append(arg) + + # for debug + print("----------- PopRun Command -----------") + for i in range(len(poprun_command) - 1): + print("%s \\" % (poprun_command[i])) + print("%s" % (poprun_command[len(poprun_command) - 1])) + print("---------------------------------------") + + # Launch + subprocess.run(" ".join(poprun_command), shell=True) + + +if __name__ == '__main__': + ipu_launch = IPULaunch.parse_ipu_args() + ipu_launch.launch() diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index fe94c25e12d2d..3cd04affa29c2 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -46,13 +46,13 @@ def _build_stats(self, amp_lists, dist_context): if int(op.attr('op_role')) == int(OpRole.Forward): self._mark_black_white_ops(amp_lists) elif int(op.attr('op_role')) == int(OpRole.Backward): - if op.desc.id() in dist_op_context.grad_op_id_to_op_id: - fwd_op_id = dist_op_context.grad_op_id_to_op_id[op.desc.id( - )] + if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[ + op.desc.original_id()] if self._is_fp16_op(fwd_op_id) == True: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True elif self._is_fp16_op(fwd_op_id) == False: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif int(op.attr('op_role')) == int(OpRole.Optimize): break @@ -70,12 +70,12 @@ def _mark_black_white_ops(self, amp_lists): continue if amp_lists.black_varnames is not None and _is_in_black_varnames( op, amp_lists): - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False continue if op.type in amp_lists.black_list: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif op.type in amp_lists.white_list: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True elif op.type in amp_lists.gray_list: is_black_op = False is_white_op = False @@ -95,22 +95,22 @@ def _mark_black_white_ops(self, amp_lists): else: prev_op = in_var.op # if it's one of inputs - if self._is_fp16_op(prev_op.desc.id()) == False or \ + if self._is_fp16_op(prev_op.desc.original_id()) == False or \ prev_op.type in amp_lists.black_list: is_black_op = True - elif 
self._is_fp16_op(prev_op.desc.id()) == True or \ + elif self._is_fp16_op(prev_op.desc.original_id()) == True or \ prev_op.type in amp_lists.white_list: is_white_op = True if is_black_op: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif is_white_op: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True else: pass else: # For numerical safe, we apply fp32 computation on ops that # are not determined which list they should stay. - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False def cast_forward_program(self, dist_context): ops = self._block.ops @@ -120,11 +120,11 @@ def cast_forward_program(self, dist_context): num_cast_ops = 0 if int(op.attr('op_role')) == int(OpRole.Backward): break - if self._is_fp16_op(op.desc.id()) == False: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_cast_op_forward( op, idx, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, dist_context) - elif self._is_fp16_op(op.desc.id()) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_cast_op_forward( op, idx, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, dist_context) @@ -198,7 +198,7 @@ def _insert_cast_op_forward(self, op, idx, src_dtype, dst_dtype, else: if op.has_attr('in_dtype'): op._set_attr('in_dtype', dst_dtype) - self._var_name_dict[op.desc.id()] = var_name_dict + self._var_name_dict[op.desc.original_id()] = var_name_dict if src_dtype == core.VarDesc.VarType.FP32 and dst_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: @@ -225,13 +225,14 @@ def cast_backward_program(self, params_grads, dist_context): while idx < len(ops): num_cast_ops = 0 grad_op = ops[idx] + grad_op_orig_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(grad_op.desc.id()) == False: # fp32 + if grad_op_orig_id in dist_op_context.grad_op_id_to_op_id: + if self._is_fp16_op(grad_op_orig_id) == False: # fp32 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, dist_context) - elif self._is_fp16_op(grad_op.desc.id()) == True: # fp16 + elif self._is_fp16_op(grad_op_orig_id) == True: # fp16 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, dist_context) @@ -272,8 +273,9 @@ def _keep_fp32_output(op, out_name): return False num_cast_ops = 0 + original_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context - fwd_op_id = dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()] + fwd_op_id = dist_op_context.grad_op_id_to_op_id[original_id] for in_name in grad_op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 9dda310e5c022..b01f3975aefdd 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -153,23 +153,24 @@ def _mark_op(self, op): # ernie inference trick if op.type == "assign" and "array_" in op.input_arg_names[0]: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False return if _need_keep_fp32(op, self.amp_list.unsupported_list, self.use_fp16_guard): - self._op_fp16_dict[op.desc.id()] = 
False + self._op_fp16_dict[op.desc.original_id()] = False else: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True for var_name in op.output_arg_names: # assert var_name not in self.forward_non_leaf_tensors, "{}".format(var_name) self.forward_non_leaf_tensors[var_name] = op.desc.id() elif is_backward_op(op) == int(OpRole.Backward): - if op.desc.id() in self.grad_op_to_op_map: - fwd_op_id = self.grad_op_to_op_map[op.desc.id()] + if op.desc.original_id() in self.grad_op_to_op_map: + fwd_op_id = self.grad_op_to_op_map[op.desc.original_id()] assert fwd_op_id in self._op_fp16_dict, "{}".format(str(op)) - self._op_fp16_dict[op.desc.id()] = self._op_fp16_dict[fwd_op_id] + self._op_fp16_dict[op.desc.original_id()] = self._op_fp16_dict[ + fwd_op_id] if int(op.attr('op_role')) == 257: self.is_train = True @@ -192,10 +193,10 @@ def set_var_to_fp16(self, var_name, block): def resolute_tensor_dtype(self, block): for op in block.ops: - op_id = op.desc.id() if is_forward_op(op): # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - if self._is_fp16_op(op_id) == True or op.type == "cast": + if self._is_fp16_op(op.desc.original_id()) == True \ + or op.type == "cast": for in_name in op.input_names: if _keep_fp32_input(op, in_name): continue @@ -209,7 +210,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op_id) == False: + elif self._is_fp16_op(op.desc.original_id()) == False: for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -217,7 +218,7 @@ def resolute_tensor_dtype(self, block): if out_var.dtype == core.VarDesc.VarType.FP16: out_var.desc.set_dtype(core.VarDesc.VarType.FP32) elif is_backward_op(op): - if self._is_fp16_op(op_id) == True: + if self._is_fp16_op(op.desc.original_id()) == True: for out_name in op.output_names: if _keep_fp32_output(op, out_name): continue @@ -225,7 +226,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op_id) == False: + elif self._is_fp16_op(op.desc.original_id()) == False: for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -238,28 +239,27 @@ def cast_block(self, block): idx = 0 while idx < len(block.ops): op = block.ops[idx] - op_id = op.desc.id() num_cast_ops = 0 if op.type in __amp_skip_ops__: idx += 1 continue elif is_forward_op(op): - if self._is_fp16_op(op_id) == False: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_forward_cast_ops( op, idx, block, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, self.dist_context) - elif self._is_fp16_op(op_id) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_forward_cast_ops( op, idx, block, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, self.dist_context) elif is_backward_op(op): - if op_id in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(op_id) == False: + if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_backward_cast_ops( op, idx, block, core.VarDesc.VarType.FP16, 
core.VarDesc.VarType.FP32, self.dist_context) - elif self._is_fp16_op(op_id) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_backward_cast_ops( op, idx, block, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, self.dist_context) @@ -282,7 +282,6 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, dist_context): num_cast_ops = 0 - op_id = op.desc.id() for in_name in op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( @@ -300,7 +299,7 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, cast_name = in_var.name + '.cast_' + _dtype_to_str( dst_dtype) cast_var = block.vars.get(cast_name) - self.forward_input_cast_ops[op_id] += [( + self.forward_input_cast_ops[op.desc.original_id()] += [( cast_name, in_var.name, dst_dtype, src_dtype, in_name)] in_var_dist_attr = consume_op_attr.get_input_dist_attr( @@ -349,8 +348,9 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, num_cast_ops = 0 op_id = op.desc.id() + original_id = op.desc.original_id() dist_op_context = dist_context.dist_op_context - forward_op_id = dist_op_context.grad_op_id_to_op_id[op_id] + forward_op_id = dist_op_context.grad_op_id_to_op_id[original_id] grad_op_attr = dist_context.get_op_dist_attr_for_program(op) assert grad_op_attr is not None diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index 258f46304d189..c6d1685446277 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -315,7 +315,7 @@ def _apply_single_impl(self, main_programs, startup_programs, context): # When traversing all grad_ops in reverse, need to set a flag to indicate # whether the ckpt and its segment_descs can be used. 
ckpt_op = op_path[segment[1] - 1] - ckpt_ops_dict[ckpt_op.desc.id()] = [True, segment_descs] + ckpt_ops_dict[ckpt_op.desc.original_id()] = [True, segment_descs] # step 4: insert recomputed fwd ops ops = main_block.ops @@ -339,9 +339,9 @@ def _apply_single_impl(self, main_programs, startup_programs, context): _rename_arg_([grad_op.desc], key, var_name_dict[key]) # insert recomputed ops - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - fwd_op_id = dist_op_context.grad_op_id_to_op_id[grad_op.desc.id( - )] + original_id = grad_op.desc.original_id() + if original_id in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[original_id] if fwd_op_id in ckpt_ops_dict and ckpt_ops_dict[fwd_op_id][0]: idx = grad_op.idx while idx - 1 >= 0 and ops[idx - 1].type == "sum": diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 76e617c7dafcf..6112a9a1f45b6 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -375,12 +375,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if attrs['use_ps_gpu']: _program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -614,15 +614,24 @@ def _check_conflict(self, other_pass): return True def _add_push_box_sparse_op(self, program): + insert_index = -1 + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + insert_index = idx for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) for op_desc in grad_op_desc: - new_op_desc = program.global_block().desc.append_op() + new_op_desc = program.global_block().desc._insert_op( + insert_index + 1) new_op_desc.copy_from(op_desc) new_op_desc._set_attr(op_role_attr_name, backward) + new_op = paddle.fluid.framework.Operator(program.global_block(), + new_op_desc) + program.global_block().ops.insert(insert_index + 1, new_op) + program.global_block()._sync_with_cpp() def _remove_optimizer_var(self, program): embedding_w = {} @@ -670,7 +679,7 @@ def _remove_lookup_table_grad_op_and_var(self, program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6df7559a22e8..888d517116a15 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1013,12 +1013,13 @@ def sync_strategy_envs(): if self.context['ps_mode'] == DistributedMode.GEO: self._communicator.init_params(init_params) else: - if role_id == 0: - self._init_all_params(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + if role_id == 0: + self._init_all_params(scopes, send_ctx, dense_map) fleet.util.barrier() - - self._pull_all_dense(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + self._pull_all_dense(scopes, send_ctx, dense_map) 
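Why the AMP, FP16 and recompute passes above (and backward.py in the hunk that follows) now key their lookups on op.desc.original_id() instead of op.desc.id(): appending a grad desc via copy_from gives it a fresh id(), but, as the deleted "Rebuild the mapping" block below implies, its original_id() still matches the desc it was copied from, so grad_op_id_to_op_id no longer needs to be patched after the copy. A small hedged sketch of that invariant (inferred from the diff, not quoted from Paddle's sources):

    new_op_desc = target_block.desc.append_op()
    new_op_desc.copy_from(op_desc)
    assert new_op_desc.id() != op_desc.id()                    # per-desc id changes on copy
    assert new_op_desc.original_id() == op_desc.original_id()  # stable key for the mapping
    fwd_id = distop_context.grad_op_id_to_op_id[new_op_desc.original_id()]  # still resolves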
fleet.util.barrier() if self.context['ps_mode'] == DistributedMode.GEO: diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 145ecc83cfc26..ed3e0bc98ed6d 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1107,8 +1107,10 @@ def update_distop_context(distop_context, op_grad_to_var, distop_context.grad_var_to_var[appending_grad_times].update( op_grad_to_var) for op_desc in grad_op_desc: - assert op_desc.id() not in distop_context.grad_op_id_to_op_id - distop_context.grad_op_id_to_op_id[op_desc.id()] = op.desc.id() + assert op_desc.original_id( + ) not in distop_context.grad_op_id_to_op_id + distop_context.grad_op_id_to_op_id[op_desc.original_id( + )] = op.desc.original_id() if callbacks is not None: assert (isinstance(callbacks, (list, tuple))) @@ -1255,12 +1257,6 @@ def update_distop_context(distop_context, op_grad_to_var, for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - # Rebuild the mapping because new_op_desc has a differnt id (Only for auto parallel) - if distop_context is not None: - if op_desc.id() in distop_context.grad_op_id_to_op_id: - distop_context.grad_op_id_to_op_id[new_op_desc.id( - )] = distop_context.grad_op_id_to_op_id[op_desc.id()] - distop_context.grad_op_id_to_op_id.pop(op_desc.id()) new_op_desc._set_attr(op_role_attr_name, backward) grad_to_var["__current_op_desc__"] = new_op_desc if callbacks is not None: diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index e543bc1e17b2c..348d914943521 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -376,13 +376,6 @@ def _update_activations(self, graph): activation = "" if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): activation = "relu" - elif op.op().has_attr("fuse_brelu") and op.op().attr( - "fuse_brelu"): - activation = "relu6" - alpha = 6.0 - if op.op().has_attr("fuse_brelu_threshold"): - alpha = op.op().attr("fuse_brelu_threshold") - op.set_attr("fuse_alpha", alpha) op.set_attr("fuse_activation", activation) return graph diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index f0dae081dd48f..04e1decd4af68 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -177,8 +177,7 @@ def prepare_program_conv2d(self, program): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format, - 'fuse_brelu': True + 'data_format': self.data_format }) def remove_fuse_activation_attribute(self, graph): @@ -196,9 +195,6 @@ def check_graph_after_pass(self, graph): self.assertTrue(op.op().has_attr("fuse_activation")) if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): self.assertTrue(op.op().attr("fuse_activation") == "relu") - if op.op().has_attr("fuse_brelu") and op.op().attr( - "fuse_brelu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu6") def test_quant_update_activation(self): program = fluid.Program() diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 30439ad736d26..c366af7237d1b 100644 --- 
a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -20,12 +20,13 @@ import copy import numpy as np import paddle +from paddle.fluid.framework import dygraph_only from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid import core from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning -from paddle.fluid import core OpRole = core.op_proto_and_checker_maker.OpRole OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() @@ -35,45 +36,90 @@ ] -def set_excluded_layers(main_program, param_names): +def set_excluded_layers(param_names, main_program=None): r""" Set parameter name of layers which would not be pruned as sparse weights. Args: + param_names (list of string): A list contains names of parameters. main_program (Program, optional): Program with model definition and its parameters. - param_names (list): A list contains names of parameters. + If None is given, then it would be set as `paddle.static.default_main_program(). + Default is None. Examples: - .. code-block:: python - - import paddle - from paddle.static import sparsity - - paddle.enable_static() - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["need_dense_fc"]) - - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling sparsity.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = sparsity.decorate(optimizer) - optimizer.minimize(loss, startup_program) + 1. Usage of Dynamic Graph + + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Need to set excluded layers before calling decorate + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + + optimizer = paddle.incubate.asp.decorate(optimizer) + + 2. Usage of Static Graph + + .. 
code-block:: python + + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + # Setup exluded layers out from ASP workflow. + # Please note, excluded_layers must be set before calling optimizer.minimize(). + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.static.amp.decorate(optimizer ) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ + if main_program is None: + main_program = paddle.static.default_main_program() ASPHelper.set_excluded_layers( - main_program=main_program, param_names=param_names) + param_names=param_names, main_program=main_program) def reset_excluded_layers(main_program=None): @@ -83,153 +129,310 @@ def reset_excluded_layers(main_program=None): Args: main_program (Program, optional): Program with model definition and its parameters. - Examples: - .. code-block:: python + If None is given, then this function would reset all excluded_layers. + Default is None. + Examples: + 1. Usage of Dynamic Graph - import paddle - from paddle.static import sparsity + .. code-block:: python - paddle.enable_static() + import paddle - main_program = paddle.static.Program() - startup_program = paddle.static.Program() + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="my_first_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="my_second_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["my_second_fc"]) - # Now the weights of "my_second_fc" would not be included in Automatic SParsity's workflow. 
+ my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Need to set excluded layers before calling decorate + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + # Please note, reset_excluded_layers also must be called before calling sparsity.decorate(). + paddle.incubate.asp.reset_excluded_layers() + + optimizer = paddle.incubate.asp.decorate(optimizer) + + 2. Usage of Static Graph + + .. code-block:: python - # Reset excluded_layers, all FC layers would be included into Automatic SParsity's workflow. - # Please note, reset_excluded_layers also must be called before calling `optimizer.minimize()`. - sparsity.reset_excluded_layers(main_program) + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + # Setup exluded layers out from ASP workflow. + # Please note, excluded_layers must be set before calling optimizer.minimize(). + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + # Please note, reset_excluded_layers also must be called before calling optimizer.minimize(). + paddle.incubate.asp.reset_excluded_layers(main_program) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.static.amp.decorate(optimizer ) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ ASPHelper.reset_excluded_layers(main_program=main_program) def decorate(optimizer): r""" - Wrap the given optimizer as a OptimizerWithSparsityGuarantee, - which would insert necessary ops for ASP workflows when calling minimize() + Wrap the given optimizer as a OptimizerWithSparsityGuarantee, + If runnig with dynamic graph mode. ASP would creates mask variables for supported parameters. + Else if in static graph mode, ASP would creates mask variables and inserts necessary ops + when calling minimize() Args: optimizer (Optimizer): A Optimizer used for training. Returns: OptimizerWithSparsityGuarantee: A wrapper for ASP to decorate `minimize` function of the given optimizer. Examples: - .. code-block:: python + 1. Usage of Dynamic Graph - import paddle - from paddle.static import sparsity + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) - main_program = paddle.static.Program() - startup_program = paddle.static.Program() + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction - paddle.enable_static() + my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None) - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # will apply necessary masking operations for ASP workflow. + # In dynamic graph mode, ASP would create related mask variables during decoration. + optimizer = paddle.incubate.asp.decorate(optimizer) - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = sparsity.decorate(optimizer) - # if do sparse training with Fleet, please replace above decorate with: - # strategy = paddle.distributed.fleet.DistributedStrategy() - # strategy.asp = True - # optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + 2. Usage of Static Graph - optimizer.minimize(loss, startup_program) + .. code-block:: python + + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + # In static graph mode, ASP creates related mask variables + # during minimize(). 
+ optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ return ASPHelper.decorate(optimizer) -def prune_model(main_program=None, - n=2, - m=4, - mask_algo='mask_1d', - with_mask=True): +def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): r""" - Pruning parameters of supported layers in :attr:`main_program` via + Pruning parameters of supported layers in :attr:`model` via specified mask generation function given by :attr:`mask_algo`. This function supports both training and inference controlled by :attr:`with_mask`. If :attr:`with_mask` is True, it would also prune parameter related ASP mask Variables, else only prunes parameters. - *Note*: If parameters are supported and in FP16, please set :attr:`n`=2, :attr:`m`=4, - if they in FP32, then :attr:`n`=1, :attr:`m`=2` to further enable Sparse Tensor Core acceleration. - - *Note*: If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` + *Note*: (Static graph mode) If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` and initialization (`exe.run(startup_program`)) before (For successfully obtain mask Variable). Typically set `with_mask` as true for training (have called `OptimizerWithSparsityGuarantee.minimize`) and false for - inference only. To obtain OptimizerWithSparsityGuarantee, please see `sparsity.decoreate()`. + inference only. To obtain OptimizerWithSparsityGuarantee, please see `paddle.incubate.asp.decoreate()`. Args: - main_program (Program, optional): Program with model definition and its parameters. Default is `paddle.static.default_main_program() - n (int): n of `n:m` sparse pattern. - m (int): m of `n:m` sparse pattern. + model (Program|nn.Layer): Program with model definition and its parameters, or a object of `paddle.nn.Layer`. + n (int, optional): n of `n:m` sparse pattern. Default is 2. + m (int, optional): m of `n:m` sparse pattern. Default is 4. mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: - .. code-block:: python - - import paddle - from paddle.static import sparsity - - paddle.enable_static() - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["need_dense_fc"]) + 1. 
Usage of Dynamic Graph - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling sparsity.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = sparsity.decorate(optimizer) - optimizer.minimize(loss, startup_program) + .. code-block:: python - device = paddle.device.get_device() - place = paddle.set_device(device) + import paddle + import numpy as np + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + my_layer = MyLayer() + loss_fn = paddle.nn.MSELoss(reduction='mean') + + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # will apply necessary masking operations for ASP workflow. + # In dynamic graph mode, ASP would create related mask variables during decoration. + optimizer = paddle.incubate.asp.decorate(optimizer) + + # Must call paddle.incubate.asp.decorate() first before calling paddle.incubate.asp.prune_model() + paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + + for i in range(10): + imgs = paddle.to_tensor( + np.random.randn(64, 3, 32, 32), + dtype='float32', stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint(10, size=(64, 1)), + dtype='float32', stop_gradient=False) + output = my_layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + 2. Usage of Static Graph - exe = paddle.static.Executor(place) - exe.run(startup_program) + .. code-block:: python - # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` - sparsity.prune_model(main_program, mask_algo='mask_2d_best') + import paddle + import numpy as np + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 32, 32]) + label = paddle.static.data(name='label', shape=[None, 1]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. 
+ # In static graph mode, ASP creates related mask variables + # during minimize(). + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) + + device = paddle.device.get_device() + place = paddle.set_device(device) + + exe = paddle.static.Executor(place) + exe.run(startup_program) + + # Must call exe.run(startup_program) first before calling paddle.asp.prune_model() + paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + # it also be accepted to call + # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') + + for i in range(10): + imgs = np.random.randn(64, 3, 32, 32).astype('float32') + labels = np.random.randint(10, size=(64, 1)).astype('float32') + exe.run(main_program, feed={'data':imgs, 'label':labels}) """ - if main_program is not None and hasattr( - main_program, - "distributed_info_") and main_program.distributed_info_[ - "sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda(): - gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) - place = paddle.CUDAPlace(gpu_id) - else: - device = paddle.device.get_device() - place = paddle.set_device(device) + device = paddle.device.get_device() + place = paddle.set_device(device) MaskAlgo_mapping = { 'mask_1d': sparsity.MaskAlgo.MASK_1D, @@ -237,11 +440,26 @@ def prune_model(main_program=None, 'mask_2d_best': sparsity.MaskAlgo.MASK_2D_BEST } assert (mask_algo in MaskAlgo_mapping), \ - 'The "mask_algo" should be one of ["mask_1d", "mask_2d_greedy", "mask_2d_best"]' + 'The "mask_algo" should be one of ["mask_1d", "mask_2d_greedy", "mask_2d_best"]' + + prune_func = None + if isinstance(model, paddle.nn.Layer): + prune_func = ASPHelper.prune_model_by_layer + elif isinstance(model, paddle.static.Program): + prune_func = ASPHelper.prune_model_by_program + if hasattr(model, "distributed_info_") and \ + model.distributed_info_["sharding_degree"] > 1 and \ + paddle.fluid.is_compiled_with_cuda(): + gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = paddle.CUDAPlace(gpu_id) + else: + raise TypeError( + "model should be paddle.nn.Layer or paddle.static.Program, but got {}". + format(type(model))) - return ASPHelper.prune_model( - place=place, - main_program=main_program, + return prune_func( + place, + model, n=n, m=m, mask_algo=MaskAlgo_mapping[mask_algo], @@ -300,7 +518,7 @@ class ASPHelper(object): __asp_info = {} @classmethod - def set_excluded_layers(cls, main_program, param_names): + def set_excluded_layers(cls, param_names, main_program): r""" This is the implementation of `sparsity.set_excluded_layers`, for details please see explanation in `sparsity.set_excluded_layers`. """ @@ -313,8 +531,8 @@ def reset_excluded_layers(cls, main_program=None): This is the implementation of `sparsity.reset_excluded_layers`, for details please see explanation in `sparsity.reset_excluded_layers`. """ if main_program is None: - for asp_info in cls.__asp_info: - asp_info.reset_excluded_layers() + for prog in cls.__asp_info: + cls.__asp_info[prog].reset_excluded_layers() else: cls._get_program_asp_info(main_program).reset_excluded_layers() @@ -323,16 +541,25 @@ def decorate(optimizer): r""" This is the implementation of `sparsity.decorate`, for details please see explanation in `sparsity.decorate`. """ + if paddle.in_dynamic_mode(): + # main_prog and startup_prog would be used with paddle.static.program_guard + # to create ASP masks. Moreover, main_prog is a key to map paddle.static.Program + # to its own ASP informantion, like ASP mask variables. 
For dynamic graph, we use + # default_main_program as the key. + main_prog = paddle.static.default_main_program() + startup_prog = paddle.static.default_startup_program() + ASPHelper._create_mask_variables(main_prog, startup_prog, + optimizer._parameter_list) return OptimizerWithSparsityGuarantee(optimizer) @classmethod - def prune_model(cls, - place, - main_program=None, - n=2, - m=4, - mask_algo=sparsity.MaskAlgo.MASK_1D, - with_mask=True): + def prune_model_by_program(cls, + place, + main_program=None, + n=2, + m=4, + mask_algo=sparsity.MaskAlgo.MASK_1D, + with_mask=True): r""" This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. """ @@ -366,9 +593,63 @@ def prune_model(cls, np.array(weight_mask_tensor).dtype) weight_mask_tensor.set(weight_sparse_mask, place) asp_info.update_masks(param.name, weight_sparse_mask) - return asp_info.masks.copy() + @classmethod + def prune_model_by_layer(cls, + place, + layer, + n=2, + m=4, + mask_algo=sparsity.MaskAlgo.MASK_1D, + with_mask=True): + r""" + This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. + """ + if paddle.in_dynamic_mode(): + main_program = paddle.static.default_main_program() + asp_info = cls._get_program_asp_info(main_program) + + for param in layer.parameters(): + if ASPHelper._is_supported_layer(main_program, param.name): + weight_nparray = param.numpy() + + prune_func = ASPHelper._get_prune_func_by_name(param.name) + + weight_pruned_nparray, weight_sparse_mask = \ + prune_func(weight_nparray, m, n, mask_algo, param.name) + + weight_pruned_nparray = weight_pruned_nparray.astype( + weight_nparray.dtype) + param.set_value(weight_pruned_nparray) + + if with_mask: + weight_mask_param = asp_info.mask_vars.get(param.name, + None) + assert weight_mask_param is not None, \ + 'Cannot find {} variable, please call sparsity.decorate() to' \ + ' decorate your optimizer first!'.format(ASPHelper._get_mask_name(param.name)) + weight_mask_param.set_value(weight_sparse_mask) + + asp_info.update_masks(param.name, weight_sparse_mask) + + return asp_info.masks.copy() + else: + # This for loop is only used to obtain Block and Program from + # first parameters. + target_program = None + for param in layer.parameters(): + target_program = param.block.program + assert target_program is not None, \ + 'Cannot get paddle.static.Program from Paddle.nn.Layer.' 
+        return ASPHelper.prune_model_by_program(
+            place,
+            target_program,
+            n=n,
+            m=m,
+            mask_algo=mask_algo,
+            with_mask=with_mask)
+
     @staticmethod
     def _get_mask_name(param_name):
         r"""
@@ -393,13 +674,15 @@ def _get_not_ASP_relevant_vars(main_program):
         """
         var_list = []
         for param in main_program.global_block().all_parameters():
-            if ASPHelper.MASK_APPENDDED_NAME not in param.name:
+            param_name_list = param.name.split('.')
+
+            if ASPHelper.MASK_APPENDDED_NAME not in param_name_list:
                 var_list.append(param)
         return var_list
 
     @classmethod
     def _get_program_asp_info(cls, main_program):
-        if not main_program in cls.__asp_info:
+        if main_program not in cls.__asp_info:
             cls.__asp_info[main_program] = ProgramASPInfo()
         return cls.__asp_info[main_program]
 
@@ -508,14 +791,37 @@ def _minimize(cls,
         optimizer_ops, params_and_grads = optimizer.minimize(
             loss, startup_program, parameter_list, no_grad_set=no_grad_set)
-        cls._create_mask_variables(main_program, startup_program,
-                                   params_and_grads)
-        cls._insert_sparse_mask_ops(main_program, params_and_grads)
+
+        params_only = [pg[0] for pg in params_and_grads]
+        cls._create_mask_variables(main_program, startup_program, params_only)
+        cls._insert_sparse_mask_ops(main_program, params_only)
         return optimizer_ops, params_and_grads
 
     @classmethod
-    def _create_mask_variables(cls, main_program, startup_program,
-                               params_and_grads):
+    @dygraph_only
+    def _step(cls, optimizer):
+        r"""
+        This function is a decorator of the `step` function in `Optimizer`.
+        There are two steps:
+
+        1. Call :attr:`optimizer`.step()
+        2. Mask parameters with sparse masks.
+
+        *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`.
+        (There is an invisible graph optimization in `Fleet.minimize()` after which the training
+        graph cannot be modified anymore.)
+
+        Args:
+            optimizer (Optimizer): An Optimizer used for training.
+        """
+        optimizer.step()
+        main_prog = paddle.static.default_main_program()
+        with paddle.fluid.dygraph.no_grad():
+            ASPHelper._insert_sparse_mask_ops(main_prog,
+                                              optimizer._parameter_list)
+
+    @classmethod
+    def _create_mask_variables(cls, main_program, startup_program, params):
         r"""
         Create sparse mask Tensors according to supported layers in :attr:`main_program`.
         This function is called in second step of `ASPHelper._minimize`
         Args:
             main_program (Program): Program with model definition and its parameters.
             startup_program (Program): Program for initializing parameters.
-            params_and_grads (list): Variable pairs of parameters and their gradients.
+            params (list): A list of parameter Variables.
""" asp_info = cls._get_program_asp_info(main_program) with program_guard(main_program, startup_program): - for param_and_grad in params_and_grads: - if ASPHelper._is_supported_layer(main_program, - param_and_grad[0].name): - mask_param = layers.create_parameter( - name=ASPHelper._get_mask_name(param_and_grad[0].name), - shape=param_and_grad[0].shape, - dtype=param_and_grad[0].dtype, - default_initializer=ConstantInitializer(value=1.0)) - mask_param.stop_gradient = True - mask_param.trainable = False - asp_info.update_mask_vars(param_and_grad[0].name, - mask_param) + for param in params: + if ASPHelper._is_supported_layer(main_program, param.name): + if param.name not in asp_info.mask_vars: + mask_param = layers.create_parameter( + name=ASPHelper._get_mask_name(param.name), + shape=param.shape, + dtype=param.dtype, + default_initializer=ConstantInitializer(value=1.0)) + mask_param.stop_gradient = True + mask_param.trainable = False + asp_info.update_mask_vars(param.name, mask_param) @classmethod - def _insert_sparse_mask_ops(cls, main_program, param_grads): + def _insert_sparse_mask_ops(cls, main_program, params): r""" Insert masking ops in the end of parameters update. This function is called in third step of `ASPHelper._minimize` Args: main_program (Program): Program with model definition and its parameters. - params_and_grads (list): Variable pairs of parameters and their gradients. + params (list): Variable parameters. """ block = main_program.global_block() asp_info = cls._get_program_asp_info(main_program) - for param_grad in param_grads: - if param_grad[0].name in asp_info.mask_vars: + for param in params: + if param.name in asp_info.mask_vars: block.append_op( type='elementwise_mul', - inputs={ - "X": param_grad[0], - 'Y': asp_info.mask_vars[param_grad[0].name] - }, - outputs={'Out': param_grad[0]}, + inputs={"X": param, + 'Y': asp_info.mask_vars[param.name]}, + outputs={'Out': param}, attrs={ 'axis': -1, 'use_mkldnn': False, - OP_ROLE_KEY: OpRole.Optimize + OP_ROLE_KEY: int(OpRole.Optimize) }) @@ -579,8 +882,9 @@ class OptimizerWithSparsityGuarantee(object): def __init__(self, optimizer): self._optimizer = optimizer - self._learning_rate = optimizer._learning_rate - self._learning_rate_map = optimizer._learning_rate_map + + def __getattr__(self, item): + return getattr(self._optimizer, item) def minimize(self, loss, @@ -605,3 +909,55 @@ def minimize(self, startup_program=startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set) + + @dygraph_only + def step(self): + r""" + This function is a decorator of `step` function in `Optimizer`. + There are three steps: + + 1. Call :attr:`optimizer`.step() + 2. Mask parameters with sparse masks. + + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + cannot be modified anymore.) + + Args: + optimizer (Optimizer): A Optimizer used for training. + """ + ASPHelper._step(self._optimizer) + + @dygraph_only + def state_dict(self): + r""" + This function is a decorator of `state_dict` function in `Optimizer`. 
+
+        Returns:
+            state_dict(dict) : A dict containing all the Tensors used by the optimizer.
+        """
+        state_dict = self._optimizer.state_dict()
+        asp_info = ASPHelper._get_program_asp_info(
+            paddle.static.default_main_program())
+        for param_name, var in asp_info.mask_vars.items():
+            state_dict.update({ASPHelper._get_mask_name(param_name): var})
+        return state_dict
+
+    @dygraph_only
+    def set_state_dict(self, state_dict):
+        r"""
+        This function is a decorator of the `set_state_dict` function in `Optimizer`.
+        Args:
+            state_dict(dict) : A dict containing all the Tensors needed by the optimizer.
+        Returns:
+            None
+        """
+        asp_info = ASPHelper._get_program_asp_info(
+            paddle.static.default_main_program())
+        for param_name, var in asp_info.mask_vars.items():
+            param_mask_name = ASPHelper._get_mask_name(param_name)
+            assert param_mask_name in state_dict, \
+                "The {} is not found.".format(param_mask_name)
+            var.set_value(state_dict[param_mask_name])
+            asp_info.update_masks(param_name, var.numpy())
+        return self._optimizer.set_state_dict(state_dict)
diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py
index 8b8c043bc4bad..a28f7fc2b4ed6 100644
--- a/python/paddle/fluid/contrib/sparsity/utils.py
+++ b/python/paddle/fluid/contrib/sparsity/utils.py
@@ -94,13 +94,12 @@ def calculate_density(x):
         float: The density of :attr:`x`.
     Examples:
         .. code-block:: python
-
+          import paddle
           import numpy as np
-          import paddle.static.sparsity as sparsity
           x = np.array([[0, 1, 3, 0],
                         [1, 1, 0, 1]])
-          sparsity.calculate_density(x) # 0.625
+          paddle.incubate.asp.calculate_density(x) # 0.625
     """
     x_flattened = x.flatten()
     return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py
index e2fcf4f2c2712..4d5076108cd31 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py
@@ -37,7 +37,7 @@ def transform(self):
 
     def visit_Assert(self, node):
         convert_assert_node = gast.parse(
-            'paddle.jit.dy2static.convert_assert({test}, {msg})'.format(
+            '_jst.convert_assert({test}, {msg})'.format(
                test=ast_to_source_code(node.test),
                msg=ast_to_source_code(node.msg)
                if node.msg else "")).body[0].value
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
index a80dfa11402c5..c16d1ff17f707 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
@@ -71,7 +71,7 @@ def visit_Call(self, node):
         if PDB_SET in func_str:
             return node
 
-        new_func_str = "paddle.jit.dy2static.convert_call({})".format(func_str)
+        new_func_str = "_jst.convert_call({})".format(func_str)
         new_func_ast = gast.parse(new_func_str).body[0].value
         node.func = new_func_ast
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
index ef2d062d2d018..50733e4d896e4 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
@@ -39,8 +39,8 @@ def visit_Call(self, node):
         func_str = ast_to_source_code(node.func).strip()
         if func_str in self._castable_type and len(node.args) > 0:
             args_str = ast_to_source_code(node.args[0]).strip()
-            new_func_str =
"paddle.jit.dy2static.convert_var_dtype({}, '{}')".format( - args_str, func_str) + new_func_str = "_jst.convert_var_dtype({}, '{}')".format(args_str, + func_str) new_node = gast.parse(new_func_str).body[0].value return new_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 8fc5a691d212c..157822430d234 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -536,7 +536,7 @@ def create_name_nodes(name_ids): return_vars = create_name_nodes(return_name_ids) convert_ifelse_layer = gast.parse( - 'paddle.jit.dy2static.convert_ifelse(' + '_jst.convert_ifelse(' '{pred}, {true_fn}, {false_fn}, {true_args}, {false_args}, {return_vars})'. format( pred=ast_to_source_code(pred), diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index e62def897d2eb..0951635162e5e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -129,7 +129,7 @@ def _transform_slice_to_tensor_write(self, node): elif slice_is_num(target_node): value_code = ast_to_source_code(node.value) i = "paddle.cast(" \ - "x=paddle.jit.dy2static.to_static_variable({})," \ + "x=_jst.to_static_variable({})," \ "dtype='int64')".format(ast_to_source_code(slice_node)) assign_code = "{} = paddle.tensor.array_write(x={}, i={}, array={})" \ .format(target_name, value_code, i, target_name) @@ -252,7 +252,7 @@ def _replace_pop(self, node): # 2. pop stmt for a list or dict if len(args_str) == 1 # 3. pop stmt for a dict if len(args_str) == 2 if len(args_str) <= 2: - new_pop_str = "paddle.jit.dy2static.convert_pop({}, {})"\ + new_pop_str = "_jst.convert_pop({}, {})"\ .format(target_str, ",".join(args_str)) new_pop_node = gast.parse(new_pop_str).body[0].value return new_pop_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py index e5c093f9a9255..bd573521f1b4e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py @@ -57,8 +57,7 @@ def visit_UnaryOp(self, node): self.generic_visit(node) if isinstance(node.op, gast.Not): arg = ast_to_source_code(node.operand) - new_node_str = "paddle.jit.dy2static.convert_logical_not({})".format( - arg) + new_node_str = "_jst.convert_logical_not({})".format(arg) # NOTE: gast.parse returns Module(body=[expr(value=...)]) new_node = gast.parse(new_node_str).body[0].value return new_node @@ -67,13 +66,12 @@ def visit_UnaryOp(self, node): def visit_Compare(self, node): self.generic_visit(node) left_str = ast_to_source_code(node.left).strip() - if left_str.startswith("paddle.jit.dy2static.convert_var_shape"): + if left_str.startswith("_jst.convert_var_shape"): # check left and comparators are all converted var shape compare_arg_strs = left_str for i, comparator in enumerate(node.comparators): comparator_str = ast_to_source_code(comparator).strip() - if not comparator_str.startswith( - "paddle.jit.dy2static.convert_var_shape"): + if not comparator_str.startswith("_jst.convert_var_shape"): return node op_str = cmpop_node_to_str(node.ops[i]) compare_arg_strs += (", '" + op_str + "', " + comparator_str) @@ -81,7 +79,7 @@ def 
visit_Compare(self, node): # Now all left and comparators are converted shape # Replace some comparsion operation because of difference between # Python and Paddle - new_node_str = "paddle.jit.dy2static.convert_shape_compare({})".format( + new_node_str = "_jst.convert_shape_compare({})".format( compare_arg_strs) new_node = gast.parse(new_node_str).body[0].value return new_node @@ -119,7 +117,7 @@ def _create_bool_op_node(self, nodes, api_type): nodes = [pre_logic_node] + [post_logic_node] args = [ast_to_source_code(child) for child in nodes] - new_node_str = "paddle.jit.dy2static.convert_logical_{}(lambda:{}, lambda:{})".format( + new_node_str = "_jst.convert_logical_{}(lambda:{}, lambda:{})".format( api_type, args[0], args[1]) # NOTE: gast.parse return Module(body=[expr(...)]) new_node = gast.parse(new_node_str).body[0].value diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 4e5a3f7b70851..8014a00bff983 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -89,7 +89,7 @@ def create_while_nodes(condition_name, body_name, loop_var_names): else: assign_loop_var_names.append(name) - while_func_name = "paddle.jit.dy2static.convert_while_loop" + while_func_name = "_jst.convert_while_loop" while_node_str = "[{}] = {}({}, {}, [{}])".format( ",".join(assign_loop_var_names), while_func_name, condition_name, body_name, ",".join(loop_var_names)) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index 7960617369e3f..f045d01c99bab 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -50,6 +50,5 @@ def visit_Print(self, node): return gast.Expr(value=convert_print_node) def _create_print_node(self, print_args): - convert_print_func = gast.parse( - 'paddle.jit.dy2static.convert_print').body[0].value + convert_print_func = gast.parse('_jst.convert_print').body[0].value return gast.Call(func=convert_print_func, args=print_args, keywords=[]) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index b860740f71b25..2efb6965085de 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -197,10 +197,12 @@ def from_func_and_args(cls, function_spec, args, kwargs, class_instance): def __hash__(self): error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)." 
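The dy2static transformer changes above all follow one template-and-parse pattern: format a source string that references the short `_jst` alias (bound to `paddle.jit.dy2static` by an import the utils.py change later in this diff injects into the generated module), run it through `gast.parse`, and graft the resulting node back into the user's AST. A minimal standalone sketch of that pattern is shown here; the callee name `foo` is made up purely for illustration.

import gast

# Source template for a converted call; `_jst` is assumed to be resolved at
# runtime by an injected `import paddle.jit.dy2static as _jst` statement.
func_str = "foo"
new_func_str = "_jst.convert_call({})".format(func_str)

# gast.parse returns Module(body=[Expr(value=Call(...))]); the transformers
# take the inner Call node and substitute it for the original callee.
new_func_node = gast.parse(new_func_str).body[0].value
print(type(new_func_node).__name__)  # Call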
with_hook = self.kwargs.get("with_hook", False) - return hash((id(self.function_spec), - make_hashable(self.input_args_with_spec, error_msg), - make_hashable(self.input_kwargs_with_spec, error_msg), - self._spec_names_id, self.class_instance, with_hook)) + is_train = self.kwargs.get("is_train", False) + return hash( + (id(self.function_spec), + make_hashable(self.input_args_with_spec, error_msg), + make_hashable(self.input_kwargs_with_spec, error_msg), + self._spec_names_id, self.class_instance, with_hook, is_train)) def __eq__(self, other): return (type(self) is type(other)) and hash(self) == hash(other) @@ -357,7 +359,7 @@ def __call__(self, *args, **kwargs): try: concrete_program, partial_program_layer = self.get_concrete_program( - *args, **kwargs) + *args, **kwargs, is_train=self._is_train_mode()) # 3. synchronize self.training attribute. if isinstance(self._class_instance, layers.Layer): @@ -383,6 +385,12 @@ def __call__(self, *args, **kwargs): " if you can't handle this {} yourself.".format(type(e))) raise e + def _is_train_mode(self): + if self._class_instance is not None: + return self._class_instance.training + else: + return self._training + def _call_dygraph_function(self, *args, **kwargs): """ Calls dygraph function directly and returns the outputs. @@ -415,6 +423,8 @@ def get_concrete_program(self, *args, **kwargs): """ with_hook = kwargs.get("with_hook", False) + is_train = kwargs.get("is_train", True) + if "is_train" in kwargs: kwargs.pop("is_train") if "with_hook" in kwargs: kwargs.pop("with_hook") # 1. unify args/kwargs and replace Tensor with InputSpec if len(args) != len(self._function_spec.args_name): @@ -430,7 +440,8 @@ def get_concrete_program(self, *args, **kwargs): input_kwargs_with_spec, self._class_instance, **self._kwargs, - with_hook=with_hook) + with_hook=with_hook, + is_train=is_train) # 3. 
check whether hit the cache or build a new program for the input arguments concrete_program, partial_program_layer = self._program_cache[cache_key] @@ -525,7 +536,9 @@ def concrete_program_specify_input_spec(self, has_input_spec = (desired_input_spec is not None) if has_input_spec: concrete_program, _ = self.get_concrete_program( - *desired_input_spec, with_hook=with_hook) + *desired_input_spec, + with_hook=with_hook, + is_train=self._is_train_mode()) return concrete_program else: raise ValueError( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 0c7a8bf421a12..8ac659dbead99 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -336,7 +336,7 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name, # Here assume that the parent node of return is gast.If if isinstance(parent_node_of_return, gast.If): # Prepend control flow boolean nodes such as '__return@1 = True' - node_str = "{} = paddle.jit.dy2static.create_bool_as_type({}, True)".format( + node_str = "{} = _jst.create_bool_as_type({}, True)".format( return_name, ast_to_source_code(parent_node_of_return.test).strip()) @@ -449,7 +449,7 @@ def _replace_after_node_to_if_in_stmt_list( # Here assume that the parent node of return is gast.If if isinstance(parent_node_of_return, gast.If): # Prepend control flow boolean nodes such as '__return@1 = False' - node_str = "{} = paddle.jit.dy2static.create_bool_as_type({}, False)".format( + node_str = "{} = _jst.create_bool_as_type({}, False)".format( return_name, ast_to_source_code(parent_node_of_return.test).strip()) assign_false_node = gast.parse(node_str).body[0] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index 7733226cc09f2..d5b23d2f53b1c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -42,7 +42,7 @@ def create_convert_shape_node(var_shape_node, if slice_node is not None and slice_is_num(slice_node): args.append(ast_to_source_code(slice_node.slice).strip()) - convert_var_shape_func = "paddle.jit.dy2static.convert_var_shape({}, in_control_flow={})".format( + convert_var_shape_func = "_jst.convert_var_shape({}, in_control_flow={})".format( ",".join(args), in_control_flow) api_shape_node = gast.parse(convert_var_shape_func).body[0].value @@ -59,14 +59,14 @@ def create_convert_shape_node(var_shape_node, def create_choose_shape_node(attr_shape_name, api_shape_name, slice_node=None): - eval_exist_func = "paddle.jit.dy2static.eval_if_exist_else_none('{}', globals())".format( + eval_exist_func = "_jst.eval_if_exist_else_none('{}', globals())".format( api_shape_name) args = [attr_shape_name, eval_exist_func] if slice_node is not None and slice_is_num(slice_node): args.append(ast_to_source_code(slice_node.slice).strip()) - choose_shape_func = "paddle.jit.dy2static.choose_shape_attr_or_api({})".format( - ",".join(args)) + choose_shape_func = "_jst.choose_shape_attr_or_api({})".format(",".join( + args)) choose_shape_node = gast.parse(choose_shape_func).body[0].value if slice_node is not None and not slice_is_num(slice_node): return gast.Subscript( @@ -84,7 +84,7 @@ class ShapeAttributeTransformer(gast.NodeTransformer): def 
visit_Attribute(self, node): if node.attr == 'shape': args = ast_to_source_code(node.value).strip() - convert_var_shape_func = "paddle.jit.dy2static.convert_var_shape_simple({})".format( + convert_var_shape_func = "_jst.convert_var_shape_simple({})".format( args) api_shape_node = gast.parse(convert_var_shape_func).body[0].value return api_shape_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index d440e387da597..91c2c5dc65aab 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -185,6 +185,7 @@ def is_api_in_module(node, module_prefix): import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph import paddle.fluid.layers as layers + import paddle.jit.dy2static as _jst from paddle.fluid.dygraph import to_variable from paddle import to_tensor @@ -521,8 +522,8 @@ def remove_if_exit(filepath): def _inject_import_statements(): import_statements = [ "import paddle", "from paddle import Tensor", - "import paddle.fluid as fluid", "from typing import *", - "import numpy as np" + "import paddle.fluid as fluid", "import paddle.jit.dy2static as _jst", + "from typing import *", "import numpy as np" ] return '\n'.join(import_statements) + '\n' @@ -1168,7 +1169,7 @@ def _build_var_len_assign_node(self): else: iter_var_name = ast_to_source_code(self.iter_node).strip() - convert_len_node_source_str = '{} = paddle.jit.dy2static.convert_len({})'.format( + convert_len_node_source_str = '{} = _jst.convert_len({})'.format( self.iter_var_len_name, iter_var_name) convert_len_node = gast.parse(convert_len_node_source_str).body[0] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 2cd6c5e43f7e1..7ce5aede4995d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -77,14 +77,12 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): def to_static_variable_gast_node(name): - func_code = "{} = paddle.jit.dy2static.to_static_variable({})".format(name, - name) + func_code = "{} = _jst.to_static_variable({})".format(name, name) return gast.parse(func_code).body[0] def create_static_variable_gast_node(name): - func_code = "{} = paddle.jit.dy2static\ - .data_layer_not_check(name='{}', shape=[-1], dtype='float32')".format( + func_code = "{} = _jst.data_layer_not_check(name='{}', shape=[-1], dtype='float32')".format( name, unique_name.generate(name)) return gast.parse(func_code).body[0] diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a3310f1a46ce4..4d985097088f8 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3016,9 +3016,15 @@ def __init__(self, is_bias=True) def forward(self, input): - if in_dygraph_mode(): + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + + if _non_static_mode(): attrs = ('epsilon', self._epsilon, 'groups', self._groups) - out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, *attrs) + out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, + mean_out, variance_out, *attrs) return dygraph_utils._append_activation_in_dygraph(out, self._act) else: @@ -3029,10 
+3035,6 @@ def forward(self, input): inputs['Scale'] = self.weight # create output - mean_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) group_norm_out = self._helper.create_variable_for_type_inference( dtype=self._dtype) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 8049a8b8741b1..add3d73efc7e1 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -101,8 +101,11 @@ def _to_static_var(self, to_parameter=False, **kwargs): # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None). attr_not_need_keys = ['grad', 'T', 'place', '_place_str'] + param_keys = ['stop_gradient', 'trainable'] if isinstance(self, (ParamBase, EagerParamBase)): attr_kwargs = self.__dict__.copy() + for key in param_keys: + attr_kwargs[key] = getattr(self, key) else: attr_names = [] for name in dir(self): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 6957dd8c5e30c..bd453b3ddaa00 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3600,6 +3600,10 @@ def append_op(self, *args, **kwargs): attrs = kwargs.get("attrs", {}) inplace_map = kwargs.get("inplace_map", None) type = kwargs.get("type", None) + warnings.warn( + "Op `%s` is executed through `append_op` under the dynamic mode, " + "the corresponding API implementation needs to be upgraded to " + "using `_C_ops` method." 
% type, DeprecationWarning) op = Operator( block=self, desc=None, @@ -6619,6 +6623,9 @@ def __init__(self, shape, dtype, **kwargs): name = kwargs.get('name', unique_name.generate('_eager_param_base')) + if isinstance(shape, core.eager.Tensor): + shape = shape.numpy() + super(EagerParamBase, self).__init__( dtype if dtype else core.VarDesc.VarType.FP32, list(shape) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 2c09abac9e7ba..51e89cc301cf3 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -293,12 +293,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if use_ps_gpu: program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -576,7 +576,7 @@ def _add_push_box_sparse_op(program): op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() backward = core.op_proto_and_checker_maker.OpRole.Backward for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) @@ -599,7 +599,7 @@ def _remove_lookup_table_grad_op_and_var(program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index b78865a0ece4e..99c0a2e70b771 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -336,28 +336,7 @@ def square_error_cost(input, label): # [0.01, 0.01] """ - if _non_static_mode(): - minus_out = _C_ops.elementwise_sub(input, label) - square_out = _C_ops.square(minus_out) - return square_out - - check_variable_and_dtype(input, "input", ['float32', 'float64'], - 'square_error_cost') - check_variable_and_dtype(label, "label", ['float32', 'float64'], - 'square_error_cost') - helper = LayerHelper('square_error_cost', **locals()) - minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='elementwise_sub', - inputs={'X': [input], - 'Y': [label]}, - outputs={'Out': [minus_out]}) - - square_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='square', inputs={'X': [minus_out]}, - outputs={'Out': [square_out]}) - return square_out + return paddle.nn.functional.square_error_cost(input, label) def edit_distance(input, @@ -433,45 +412,8 @@ def edit_distance(input, # [4] """ - check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') - check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') - helper = LayerHelper("edit_distance", **locals()) - - # remove some tokens from input and labels - if ignored_tokens is not None and len(ignored_tokens) > 0: - erased_input = helper.create_variable_for_type_inference(dtype="int64") - erased_label = helper.create_variable_for_type_inference(dtype="int64") - - helper.append_op( - type="sequence_erase", - 
inputs={"X": [input]}, - outputs={"Out": [erased_input]}, - attrs={"tokens": ignored_tokens}) - input = erased_input - - helper.append_op( - type="sequence_erase", - inputs={"X": [label]}, - outputs={"Out": [erased_label]}, - attrs={"tokens": ignored_tokens}) - label = erased_label - - this_inputs = {"Hyps": [input], "Refs": [label]} - if input_length is not None and label_length is not None: - this_inputs['HypsLength'] = [input_length] - this_inputs['RefsLength'] = [label_length] - - # edit distance op - edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") - sequence_num = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op( - type="edit_distance", - inputs=this_inputs, - outputs={"Out": [edit_distance_out], - "SequenceNum": [sequence_num]}, - attrs={"normalized": normalized}) - - return edit_distance_out, sequence_num + return paddle.nn.functional.loss.edit_distance( + input, label, normalized, ignored_tokens, input_length, label_length) def warpctc(input, @@ -1279,52 +1221,9 @@ def softmax_with_cross_entropy(logits, out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) print(out) """ - if _non_static_mode(): - if core.is_compiled_with_npu(): - softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - else: - if in_dygraph_mode(): - softmax, loss = _C_ops.final_state_cross_entropy_with_softmax( - logits, label, soft_label, True, numeric_stable_mode, - ignore_index, axis) - if _in_legacy_dygraph(): - softmax, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - if not return_softmax: - return loss - else: - return loss, softmax - - attrs = { - 'soft_label': soft_label, - 'ignore_index': ignore_index, - 'numeric_stable_mode': numeric_stable_mode, - 'axis': axis - } - helper = LayerHelper('softmax_with_cross_entropy', **locals()) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - - outputs = {'Softmax': softmax, 'Loss': loss} - if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): - backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) - outputs['Backprop'] = backprop - helper.append_op( - type='softmax_with_cross_entropy', - inputs={'Logits': logits, - 'Label': label}, - outputs=outputs, - attrs=attrs) - - if return_softmax: - return loss, softmax - - return loss + return paddle.nn.functional.loss.fluid_softmax_with_cross_entropy( + logits, label, soft_label, ignore_index, numeric_stable_mode, + return_softmax, axis) def rank_loss(label, left, right, name=None): @@ -1733,33 +1632,7 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): print(npair_loss) """ - check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], - 'npair_loss') - check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], - 'positive') - check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], - 'labels') - Beta = 0.25 - batch_size = labels.shape[0] - - labels = nn.reshape(labels, shape=[batch_size, 1]) - labels = paddle.tile(labels, repeat_times=[1, batch_size]) - - labels = equal(labels, nn.transpose(labels, perm=[1, 0])).astype('float32') - labels = labels / nn.reduce_sum(labels, dim=1, keep_dim=True) - - l2loss = 
nn.reduce_mean(nn.reduce_sum(square(anchor), 1)) \ - + nn.reduce_mean(nn.reduce_sum(square(positive), 1)) - l2loss = l2loss * Beta * l2_reg - - similarity_matrix = paddle.matmul( - anchor, positive, transpose_x=False, transpose_y=True) - softmax_ce = softmax_with_cross_entropy( - logits=similarity_matrix, label=labels, soft_label=True) - cross_entropy = nn.reduce_sum(labels * softmax_ce, 0) - celoss = nn.reduce_mean(cross_entropy) - - return l2loss + celoss + return paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg) def mse_loss(input, label): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 97506ead5fad4..7fb9f6057b55a 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7394,30 +7394,8 @@ def dice_loss(input, label, epsilon=0.00001, name=None): predictions = F.softmax(x) loss = F.dice_loss(input=predictions, label=label) """ - assert input.dtype in (paddle.float32, paddle.float64) - assert label.dtype in (paddle.int32, paddle.int64) - assert len(input.shape) >= 2, \ - "The rank of input should be greater than or equal to 2." - assert len(input.shape) == len(label.shape), ( - "The rank of input and label should be equal, " - "but received input: %d, label: %d." % - (len(input.shape), len(label.shape))) - assert label.shape[-1] == 1, ("The last dimension of label should be 1, " - "but received %d." % label.shape[-1]) - assert input.shape[:-1] == label.shape[:-1], ( - "All dimensions should be equal except the last one.") - assert input.numel() > 0 and label.numel() > 0, \ - "Any dimension of input and label cannot be equal to 0." - - label = squeeze(label, [-1]) - label = paddle.nn.functional.one_hot(label, input.shape[-1]) - reduce_dim = list(range(1, len(input.shape))) - inse = reduce_sum(input * label, dim=reduce_dim) - dice_denominator = reduce_sum( - input, dim=reduce_dim) + reduce_sum( - label, dim=reduce_dim) - dice_score = 1 - inse * 2 / (dice_denominator + epsilon) - return reduce_mean(dice_score) + return paddle.nn.functional.dice_loss( + input, label, epsilon=epsilon, name=name) def image_resize(input, @@ -7793,10 +7771,18 @@ def _is_list_or_turple_(data): } if out_shape is not None: - if isinstance(out_shape, Variable): + if isinstance(out_shape, Variable) and not _non_static_mode(): out_shape.stop_gradient = True inputs['OutSize'] = out_shape else: + if _non_static_mode(): + if isinstance(out_shape, Variable): + out_shape = list(out_shape.numpy()) + else: + out_shape = list(out_shape) + for i, dim in enumerate(out_shape): + if isinstance(dim, Variable): + out_shape[i] = dim.numpy()[0] if not (_is_list_or_turple_(out_shape)): raise TypeError( "out_shape should be a list or tuple or Variable.") @@ -7863,7 +7849,9 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[2] else: - if isinstance(scale, Variable): + if _non_static_mode() and isinstance(scale, Variable): + scale = scale.numpy() + elif isinstance(scale, Variable): scale.stop_gradient = True inputs["Scale"] = scale elif isinstance(scale, float) or isinstance(scale, int): @@ -7883,6 +7871,26 @@ def _is_list_or_turple_(data): inputs["OutSize"] = actual_shape elif actual_shape is not None: raise TypeError("actual_shape should either be Variable or None.") + + if _non_static_mode(): + attr_list = [] + for k, v in attrs.items(): + attr_list.append(k) + attr_list.append(v) + dy_attr = tuple(attr_list) + + if resample_type == "linear": + out = _C_ops.linear_interp(input, actual_shape, *dy_attr) + elif resample_type == "bilinear": + 
out = _C_ops.bilinear_interp(input, actual_shape, *dy_attr) + elif resample_type == "trilinear": + out = _C_ops.trilinear_interp(input, actual_shape, *dy_attr) + elif resample_type == "nearest": + out = _C_ops.nearest_interp(input, actual_shape, *dy_attr) + elif resample_type == "bicubic": + out = _C_ops.bicubic_interp(input, actual_shape, *dy_attr) + return out + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='{}_interp'.format(resample_type), @@ -13573,22 +13581,7 @@ def log_loss(input, label, epsilon=1e-4, name=None): prob = paddle.randn((10,1)) cost = F.log_loss(input=prob, label=label) """ - if in_dygraph_mode(): - return _C_ops.final_state_log_loss(input, label, epsilon) - - helper = LayerHelper('log_loss', **locals()) - check_variable_and_dtype(input, 'input', ['float32'], 'log_loss') - check_variable_and_dtype(label, 'label', ['float32'], 'log_loss') - - loss = helper.create_variable_for_type_inference(dtype=input.dtype) - - helper.append_op( - type='log_loss', - inputs={'Predicted': [input], - 'Labels': [label]}, - outputs={'Loss': [loss]}, - attrs={'epsilon': epsilon}) - return loss + return paddle.nn.functional.log_loss(input, label, epsilon, name) def add_position_encoding(input, alpha, beta, name=None): @@ -13892,33 +13885,8 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): input = paddle.randn([6, 4, 2, 2]) out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ - if data_format not in ["NCHW", "NHWC"]: - raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. " - "Received Attr(data_format): {}.".format(data_format)) - if _non_static_mode(): - return _C_ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', - shift_ratio, 'data_format', data_format) - - helper = LayerHelper("temporal_shift", **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') - check_type(seg_num, 'seg_num', int, 'temporal_shift') - check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift') - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - if not isinstance(seg_num, int): - raise TypeError("seg_num must be int type.") - - helper.append_op( - type="temporal_shift", - inputs={"X": x}, - outputs={"Out": out}, - attrs={ - "seg_num": seg_num, - "shift_ratio": shift_ratio, - "data_format": data_format - }) - return out + return paddle.nn.functional.temporal_shift(x, seg_num, shift_ratio, name, + data_format) class PyFuncRegistry(object): @@ -15046,63 +15014,8 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): y = F.unfold(x, [3, 3], 1, 1, 1) """ - helper = LayerHelper("unfold", **locals()) - - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') - - assert len(x.shape) == 4, \ - "input should be the format of [N, C, H, W]" - - if isinstance(kernel_sizes, int): - kernel_sizes = [kernel_sizes, kernel_sizes] - else: - assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ - "kernel_sizes should either be an integer or a list of two integers" - - if isinstance(strides, int): - strides = [strides, strides] - else: - assert isinstance(strides, list) and (len(strides) == 2), \ - "strides should either be an integer or a list of two integers" - - if isinstance(dilations, int): - dilations = [dilations, dilations] - else: - assert isinstance(dilations, list) and (len(dilations) == 2), \ - "dilations should either be an integer or a list of two integers" - - if isinstance(paddings, int): - paddings = [paddings] * 4 - elif 
isinstance(paddings, list): - if len(paddings) == 2: - paddings = paddings * 2 - elif len(paddings) == 4: - pass - else: - raise ValueError( - "paddings should either be an integer or a list of 2 or 4 integers" - ) - else: - raise ValueError( - "Unexpected type of paddings, it should be either an integer or a list" - "of 2 or 4 integers") - - if in_dygraph_mode(): - return _C_ops.final_state_unfold(x, kernel_sizes, strides, paddings, - dilations) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="unfold", - inputs={"X": x}, - outputs={"Y": out}, - attrs={ - "kernel_sizes": kernel_sizes, - "strides": strides, - "paddings": paddings, - "dilations": dilations - }) - return out + return paddle.nn.functional.unfold(x, kernel_sizes, strides, paddings, + dilations, name) def deformable_roi_pooling(input, @@ -15554,26 +15467,7 @@ def gather_tree(ids, parents): # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]] """ - if in_dygraph_mode(): - return _C_ops.final_state_gather_tree(ids, parents) - else: - if _in_legacy_dygraph(): - return _C_ops.gather_tree(ids, parents) - else: - helper = LayerHelper('gather_tree', **locals()) - check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], - 'gather_tree') - check_variable_and_dtype(parents, 'parents', ['int32', 'int64'], - 'gather_tree') - out = helper.create_variable_for_type_inference(dtype=ids.dtype) - - helper.append_op( - type="gather_tree", - inputs={"Ids": ids, - "Parents": parents}, - outputs={"Out": out}) - - return out + return paddle.nn.functional.gather_tree(ids, parents) @deprecated(since="2.0.0", update_to="paddle.uniform") diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 80dc990af4556..702e38f3d2368 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle from .layer_function_generator import templatedoc from ..framework import core, Variable, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph, convert_np_dtype_to_dtype_ from ..layer_helper import LayerHelper @@ -1382,35 +1383,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): """ - if in_dygraph_mode(): - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) - if maxlen is not None: - if isinstance(maxlen, core.eager.Tensor): - attrs = ('out_dtype', dtype) - out = _C_ops.sequence_mask(x, maxlen, *attrs) - else: - attrs = ('out_dtype', dtype, 'maxlen', maxlen) - out = _C_ops.sequence_mask(x, None, *attrs) - out.stop_gradient = True - return out - - helper = LayerHelper('sequence_mask', **locals()) - out = helper.create_variable_for_type_inference(dtype=dtype) - - inputs = {'X': [x]} - attrs = {'out_dtype': out.dtype} - if maxlen is not None: - if isinstance(maxlen, Variable): - inputs['MaxLenTensor'] = maxlen - else: - attrs['maxlen'] = maxlen - - helper.append_op( - type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs) - - out.stop_gradient = True - return out + return paddle.nn.functional.sequence_mask(x, maxlen, dtype, name) @templatedoc() diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index b02c154584e9c..3b1fcc15ab95f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -681,14 +681,19 @@ def assign(input, output=None): "saving it to file and 'load_op' to load it") if output is None: output = 
helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + if _non_static_mode(): + _C_ops.assign_value(output, 'shape', + list(input.shape), 'dtype', dtype, value_name, + values) + else: + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) if is_inplace and _non_static_mode(): output._bump_inplace_version() diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 3cef228d14d6e..d52882acfc9ac 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import site from paddle.fluid import core from distutils.sysconfig import get_python_lib from distutils.core import setup, Extension @@ -42,10 +43,11 @@ def build_extensions(self): paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] # include path -site_packages_path = get_python_lib() -paddle_custom_kernel_include = [ - os.path.join(site_packages_path, 'paddle', 'include'), -] +site_packages_path = site.getsitepackages() +paddle_custom_kernel_include = list( + map(lambda path: os.path.join(path, 'paddle', 'include'), + site_packages_path)) + # include path third_party compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], 'build/third_party') @@ -56,9 +58,8 @@ def build_extensions(self): ] # libs path -paddle_custom_kernel_library_dir = [ - os.path.join(site_packages_path, 'paddle', 'fluid'), -] +paddle_custom_kernel_library_dir = list( + map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path)) # libs libs = [':core_avx.so'] diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6e80e142c4b85..34237d47a5659 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -131,6 +131,8 @@ if(NOT WITH_GPU) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) LIST(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_op) LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer) + LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op) + LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op_api) endif() LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) @@ -185,6 +187,8 @@ endif() # Temporally disable test_deprecated_decorator LIST(REMOVE_ITEM TEST_OPS test_deprecated_decorator) +LIST(REMOVE_ITEM TEST_OPS test_tensordot) + if(WIN32) LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception) LIST(REMOVE_ITEM TEST_OPS test_trainer_desc) @@ -325,6 +329,7 @@ if ((NOT WITH_NCCL) AND (NOT WITH_RCCL)) endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) + LIST(REMOVE_ITEM TEST_OPS test_fused_gate_attention_op) LIST(REMOVE_ITEM TEST_OPS test_boxps) endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 @@ -618,22 +623,22 @@ endif() py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS}) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet 
ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) py_test_modules(test_install_check MODULES test_install_check ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST") py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_static_runner_mnist ENVS FLAGS_cudnn_deterministic=1) @@ -761,19 +766,19 @@ if(WITH_DISTRIBUTE) # port range (20000, 23000) is reserved for dist-ops set(dist_ut_port 20001) foreach(TEST_OP ${DIST_TEST_OPS}) - bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") + bash_test_modules(${TEST_OP} START_BASH dist_test.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") MATH(EXPR dist_ut_port "${dist_ut_port}+20") if(dist_ut_port GREATER_EQUAL 22998) message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() endforeach(TEST_OP) # solve it later. 
- bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) if (WITH_GLOO) - bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif() if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) - bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_new_group START_BASH test_new_group.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif() endif(NOT APPLE) endif() @@ -1036,7 +1041,7 @@ set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIME set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) -set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) +#set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120) @@ -1150,7 +1155,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) - set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) @@ -1180,8 +1185,8 @@ endif() if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_alltoall_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT 200) + set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT 200) set_tests_properties(test_collective_sendrecv_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt index 
b6b313465ab20..76856d88e1789 100644 --- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -1,8 +1,8 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp") -list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_amp") +list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_static") +list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_dynamic") list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_sharding") foreach(TEST_OP ${TEST_OPS}) @@ -10,9 +10,9 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) if(WITH_DISTRIBUTE) - py_test_modules(test_fleet_with_asp MODULES test_fleet_with_asp ENVS ${dist_ENVS}) if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) - py_test_modules(test_fleet_with_asp_amp MODULES test_fleet_with_asp_amp ENVS ${dist_ENVS}) + py_test_modules(test_fleet_with_asp_dynamic MODULES test_fleet_with_asp_dynamic ENVS ${dist_ENVS}) + py_test_modules(test_fleet_with_asp_static MODULES test_fleet_with_asp_static ENVS ${dist_ENVS}) endif() endif() @@ -21,3 +21,8 @@ if((WITH_DISTRIBUTE) AND (NOT WIN32) AND (NOT APPLE)) py_test_modules(test_fleet_with_asp_sharding MODULES test_fleet_with_asp_sharding ENVS ${dist_ENVS}) endif() endif() + +set_tests_properties(test_asp_pruning_dynamic PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_pruning_static PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_optimize_dynamic PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_optimize_static PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py index d41a7b2b842e8..e594bc5c34eb3 100644 --- a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py +++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py @@ -20,7 +20,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.static import sparsity from paddle.fluid.contrib.sparsity.asp import ASPHelper import numpy as np @@ -60,7 +59,7 @@ def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): loss = fluid.layers.mean( fluid.layers.cross_entropy( input=self.predict, label=self.label)) - optimizer = sparsity.decorate( + optimizer = paddle.incubate.asp.decorate( fluid.optimizer.SGD(learning_rate=0.01)) optimizer.minimize(loss, self.startup_program) @@ -75,7 +74,7 @@ def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): def __pruning_and_checking(self, exe, place, mask_func_name, check_func_name, with_mask): exe.run(self.startup_program) - sparsity.prune_model( + paddle.incubate.asp.prune_model( self.main_program, mask_algo=mask_func_name, with_mask=with_mask) for param in self.main_program.global_block().all_parameters(): if ASPHelper._is_supported_layer(self.main_program, param.name): diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py index a2b499a9e01c3..dca56076dbceb 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -66,6 +66,97 @@ def test_add_supported_layer_via_name(self): my_own_layer_name in supported_layers_and_prune_func_map) +class TestASPDynamicCustomerizedPruneFunc(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + class 
CustomerLayer(paddle.nn.Layer): + def __init__(self): + super(CustomerLayer, self).__init__() + + self.weight = self.create_parameter( + shape=[32, 32], attr=None, dtype='float32', is_bias=False) + self.linear1 = paddle.nn.Linear(32, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, input_): + hidden = paddle.nn.functional.linear( + x=input_, weight=self.weight) + hidden = self.linear1(hidden) + out = self.linear2(hidden) + return out + + sparsity.add_supported_layer(CustomerLayer, my_own_pruning) + + self.layer = CustomerLayer() + self.customer_prefix = paddle.fluid.dygraph.layers._convert_camel_to_snake( + CustomerLayer.__name__) + self.supported_layer_count_ref = 3 + + def test_inference_pruning(self): + + sparsity.prune_model(self.layer, mask_algo="mask_1d", with_mask=False) + + supported_layer_count = 0 + for param in self.layer.parameters(): + mat = param.numpy() + + if sparsity.asp.ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + def test_training_pruning(self): + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=self.layer.parameters()) + optimizer = sparsity.decorate(optimizer) + + sparsity.prune_model(self.layer, mask_algo="mask_1d", with_mask=True) + + supported_layer_count = 0 + for param in self.layer.parameters(): + mat = param.numpy() + + if sparsity.asp.ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + + mat_mask = sparsity.asp.ASPHelper._get_program_asp_info( + paddle.static.default_main_program()).mask_vars[ + param.name].numpy() + + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + self.assertLessEqual( + np.sum(mat_mask.flatten() - static_tensor_mask.flatten( + )), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertTrue( + sparsity.check_sparsity( + mat_mask.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): def setUp(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py new file mode 100644 index 0000000000000..e127dca225116 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=2, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(1352, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + +class TestASPDynamicOptimize(unittest.TestCase): + def setUp(self): + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_is_supported_layers(self): + program = paddle.static.default_main_program() + + names = [ + 'embedding_0.w_0', 'fack_layer_0.w_0', 'conv2d_0.w_0', + 'conv2d_0.b_0', 'conv2d_1.w_0', 'conv2d_1.b_0', 'fc_0.w_0', + 'fc_0.b_0', 'fc_1.w_0', 'fc_1.b_0', 'linear_2.w_0', 'linear_2.b_0' + ] + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + paddle.incubate.asp.set_excluded_layers(['fc_1', 'conv2d_0']) + ref = [ + False, False, False, False, True, False, True, False, False, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + paddle.incubate.asp.reset_excluded_layers() + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + def test_decorate(self): + param_names = [param.name for param in self.layer.parameters()] + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + program = paddle.static.default_main_program() + + for name in param_names: + mask_var = ASPHelper._get_program_asp_info(program).mask_vars.get( + name, None) + if ASPHelper._is_supported_layer(program, name): + self.assertTrue(mask_var is not None) + else: + self.assertTrue(mask_var is None) + + def test_asp_training(self): + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + paddle.incubate.asp.prune_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(32, 3, 24, 24), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(32, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + def test_asp_training_with_amp(self): + self.optimizer = 
paddle.incubate.asp.decorate(self.optimizer) + + paddle.incubate.asp.prune_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(32, 3, 24, 24), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(32, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + with paddle.amp.auto_cast(enable=True): + output = self.layer(imgs) + loss = loss_fn(output, labels) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(self.optimizer, scaled) + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py similarity index 89% rename from python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py rename to python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py index 9e5e3c924f1a5..b51e28cdcb9fc 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,21 +20,20 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.static import sparsity from paddle.fluid.contrib.sparsity.asp import ASPHelper import numpy as np paddle.enable_static() -class TestASPHelper(unittest.TestCase): +class TestASPStaticOptimize(unittest.TestCase): def setUp(self): self.main_program = fluid.Program() self.startup_program = fluid.Program() def build_model(): img = fluid.data( - name='img', shape=[None, 3, 32, 32], dtype='float32') + name='img', shape=[None, 3, 24, 24], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') hidden = fluid.layers.conv2d( input=img, num_filters=4, filter_size=3, padding=2, act="relu") @@ -87,7 +86,7 @@ def test_is_supported_layers(self): self.assertTrue( ref[i] == ASPHelper._is_supported_layer(program, name)) - sparsity.set_excluded_layers(program, ['fc_1', 'conv2d_0']) + paddle.incubate.asp.set_excluded_layers(['fc_1', 'conv2d_0'], program) ref = [ False, False, False, False, True, False, True, False, False, False, True, False @@ -96,7 +95,7 @@ def test_is_supported_layers(self): self.assertTrue( ref[i] == ASPHelper._is_supported_layer(program, name)) - sparsity.reset_excluded_layers(program) + paddle.incubate.asp.reset_excluded_layers(program) ref = [ False, False, True, False, True, False, True, False, True, False, True, False @@ -109,7 +108,7 @@ def test_decorate(self): param_names = self.__get_param_names(self.main_program.global_block() .all_parameters()) with fluid.program_guard(self.main_program, self.startup_program): - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) param_names_after_minimize = self.__get_param_names( self.main_program.global_block().all_parameters()) @@ -119,7 +118,7 @@ def test_decorate(self): def test_asp_training(self): with fluid.program_guard(self.main_program, self.startup_program): - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) place = paddle.CPUPlace() @@ -129,10 +128,10 @@ def test_asp_training(self): feeder = fluid.DataFeeder(feed_list=[self.img, self.label], place=place) exe.run(self.startup_program) - sparsity.prune_model(self.main_program) + paddle.incubate.asp.prune_model(self.main_program) - data = (np.random.randn(64, 3, 32, 32), np.random.randint( - 10, size=(64, 1))) + data = (np.random.randn(32, 3, 24, 24), np.random.randint( + 10, size=(32, 1))) exe.run(self.main_program, feed=feeder.feed([data])) for param in self.main_program.global_block().all_parameters(): @@ -149,7 +148,7 @@ def test_asp_training_with_amp(self): with fluid.program_guard(self.main_program, self.startup_program): self.optimizer = fluid.contrib.mixed_precision.decorator.decorate( self.optimizer) - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) exe = fluid.Executor(place) @@ -157,10 +156,10 @@ def test_asp_training_with_amp(self): feed_list=[self.img, self.label], place=place) exe.run(self.startup_program) - sparsity.prune_model(self.main_program) + paddle.incubate.asp.prune_model(self.main_program) - data = (np.random.randn(64, 3, 32, 32), np.random.randint( - 10, size=(64, 1))) + data = (np.random.randn(32, 3, 24, 24), np.random.randint( + 10, size=(32, 1))) 
exe.run(self.main_program, feed=feeder.feed([data])) for param in self.main_program.global_block().all_parameters(): diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py deleted file mode 100644 index e99509187038c..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import paddle -import unittest -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning2DBest(TestASPHelperPruningBase): - def test_2D_best_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - def test_2D_best_training_pruning(self): - self.run_training_pruning_test( - 'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py deleted file mode 100644 index 7ad6c3ae02275..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import unittest -import paddle -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning2DGreedy(TestASPHelperPruningBase): - def test_2D_greedy_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_2d_greedy', - paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - def test_2D_greedy_training_pruning(self): - self.run_training_pruning_test( - 'mask_2d_greedy', - paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py new file mode 100644 index 0000000000000..b0fad0b64002a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +from paddle.fluid import core +from paddle.fluid.contrib.sparsity.asp import ASPHelper + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=2, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(1352, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + prediction = self.linear2(hidden) + return prediction + + +class TestASPDynamicPruningBase(unittest.TestCase): + def setUp(self): + self.layer = MyLayer() + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + + self.img = paddle.to_tensor( + np.random.uniform( + low=-0.5, high=0.5, size=(32, 3, 24, 24)), + dtype=np.float32, + place=place, + stop_gradient=False) + + self.set_config() + + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + def test_inference_pruning(self): + self.__pruning_and_checking(False) + + def test_training_pruning(self): + + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=self.layer.parameters()) + optimizer = paddle.incubate.asp.decorate(optimizer) + + self.__pruning_and_checking(True) + + def __pruning_and_checking(self, with_mask): + + paddle.incubate.asp.prune_model( + self.layer, mask_algo=self.mask_gen_func, with_mask=with_mask) + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, func_name=self.mask_check_func, n=2, m=4)) + + +class 
TestASPDynamicPruning1D(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + +class TestASPDynamicPruning2DBest(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_best' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +class TestASPDynamicPruning2DGreedy(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_greedy' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py new file mode 100644 index 0000000000000..a9986f24b0265 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import threading, time +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + +paddle.enable_static() + + +class TestASPStaticPruningBase(unittest.TestCase): + def setUp(self): + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 24, 24], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=2, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='softmax') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + + self.set_config() + + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + def test_inference_pruning(self): + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, False) + + def test_training_pruning(self): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = paddle.incubate.asp.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, True) + + def __pruning_and_checking(self, exe, place, 
with_mask): + exe.run(self.startup_program) + paddle.incubate.asp.prune_model( + self.main_program, + mask_algo=self.mask_gen_func, + with_mask=with_mask) + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, func_name=self.mask_check_func, n=2, m=4)) + + +class TestASPStaticPruning1D(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + +class TestASPStaticPruning2DBest(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_best' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +class TestASPStaticPruning2DGreedy(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_greedy' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py new file mode 100644 index 0000000000000..653cbbf84091b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + +class TestASPDynamicOptimize(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + def test_save_and_load(self): + path = "/tmp/paddle_asp_save_dy/" + net_path = path + "asp_net.pdparams" + opt_path = path + "asp_opt.pdopt" + + paddle.save(self.layer.state_dict(), net_path) + paddle.save(self.optimizer.state_dict(), opt_path) + + asp_info = ASPHelper._get_program_asp_info( + paddle.static.default_main_program()) + for param_name in asp_info.mask_vars: + mask = asp_info.mask_vars[param_name] + asp_info.update_mask_vars( + param_name, paddle.ones( + shape=mask.shape, dtype=mask.dtype)) + asp_info.update_masks(param_name, np.ones(shape=mask.shape)) + + net_state_dict = paddle.load(net_path) + opt_state_dict = paddle.load(opt_path) + + self.layer.set_state_dict(net_state_dict) + self.optimizer.set_state_dict(opt_state_dict) + + imgs = paddle.to_tensor( + np.random.randn(64, 3, 32, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +class TestASPStaticOptimize(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, predict = build_model() + self.loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict, label=self.label)) + self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) + self.optimizer = 
paddle.incubate.asp.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self.exe = fluid.Executor(self.place) + self.exe.run(self.startup_program) + + paddle.incubate.asp.prune_model(self.main_program) + + def test_save_and_load(self): + path = "/tmp/paddle_asp_save_st/" + param_path = path + "asp.pdparams" + model_path = path + "asp.pdmodel" + + paddle.save(self.main_program.state_dict(), param_path) + paddle.save(self.main_program, model_path) + + prog = paddle.load(model_path) + + state_dict = paddle.load(param_path) + prog.set_state_dict(state_dict) + + feeder = fluid.DataFeeder( + feed_list=[self.img, self.label], place=self.place) + + data = (np.random.randn(64, 3, 32, 32), np.random.randint( + 10, size=(64, 1))) + self.exe.run(prog, feed=feeder.feed([data])) + + for param in prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py index 4aac878763b6f..67ec54367d382 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py @@ -18,7 +18,6 @@ import unittest import threading, time import paddle -from paddle.static import sparsity import numpy as np @@ -41,9 +40,9 @@ def test_density(self): x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0, 1.0]]) - self.assertEqual(sparsity.calculate_density(x), 0.56) + self.assertEqual(paddle.incubate.asp.calculate_density(x), 0.56) x[:, 0] = 0.0 - self.assertEqual(sparsity.calculate_density(x), 0.4) + self.assertEqual(paddle.incubate.asp.calculate_density(x), 0.4) def test_check_mask_1d(self): x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], @@ -219,3 +218,7 @@ def __test_1D_2D_sparse_mask_generation_methods(self, x): func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py deleted file mode 100644 index 074aedb947613..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle.distributed.fleet as fleet -import paddle.distributed.fleet.base.role_maker as role_maker -import unittest -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import os -from paddle.static import sparsity -from paddle.fluid.contrib.sparsity.asp import ASPHelper -import numpy as np -cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') -if cuda_visible_devices is None or cuda_visible_devices == "": - os.environ['CUDA_VISIBLE_DEVICES'] = '0' -else: - os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] - -paddle.enable_static() - - -class TestFleetWithASP(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_TRAINERS_NUM"] = "1" - os.environ["PADDLE_TRAINER_ID"] = "0" - - def net(self, main_prog, startup_prog): - with fluid.program_guard(main_prog, startup_prog): - input_x = paddle.static.data( - name="x", shape=[-1, 32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') - - fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.asp = True - return avg_cost, strategy, input_x, input_y - - def test_with_asp(self): - fleet.init(is_collective=True) - train_prog, startup_prog = fluid.Program(), fluid.Program() - avg_cost, strategy, input_x, input_y = self.net(train_prog, - startup_prog) - - with fluid.program_guard(train_prog, startup_prog): - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer( - optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - exe = fluid.Executor(place) - feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) - exe.run(startup_prog) - - sparsity.prune_model(train_prog) - - data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) - exe.run(train_prog, feed=feeder.feed([data])) - - for param in train_prog.global_block().all_parameters(): - if ASPHelper._is_supported_layer(train_prog, param.name): - mat = np.array(fluid.global_scope().find_var(param.name) - .get_tensor()) - self.assertTrue( - paddle.fluid.contrib.sparsity.check_sparsity( - mat.T, n=2, m=4)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py new file mode 100644 index 0000000000000..3ced15bf15881 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import os +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np +cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') +if cuda_visible_devices is None or cuda_visible_devices == "": + os.environ['CUDA_VISIBLE_DEVICES'] = '0' +else: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.linear1 = paddle.nn.Linear(32, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, x): + hidden = self.linear1(x) + prediction = self.linear2(hidden) + return prediction + + +class TestFleetWithASPDynamic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_with_asp(self): + fleet.init(is_collective=True) + + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + self.optimizer = fleet.distributed_optimizer(self.optimizer) + self.layer = fleet.distributed_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(64, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +class TestFleetWithASPAMPDynamic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_with_asp(self): + fleet.init(is_collective=True) + + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + self.optimizer = fleet.distributed_optimizer(self.optimizer) + self.layer = fleet.distributed_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(64, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + scaler = 
paddle.amp.GradScaler(init_loss_scaling=1024) + + with paddle.amp.auto_cast(enable=True): + output = self.layer(imgs) + loss = loss_fn(output, labels) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(self.optimizer, scaled) + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py similarity index 67% rename from python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py rename to python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py index a34d7e69872e2..2023c0051401f 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py @@ -1,5 +1,5 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,7 +32,62 @@ paddle.enable_static() -class TestFleetWithASP(unittest.TestCase): +class TestFleetWithASPStatic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + input_x = paddle.static.data( + name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.asp = True + return avg_cost, strategy, input_x, input_y + + def test_with_asp(self): + fleet.init(is_collective=True) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy, input_x, input_y = self.net(train_prog, + startup_prog) + + with fluid.program_guard(train_prog, startup_prog): + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) + exe.run(startup_prog) + + sparsity.prune_model(train_prog) + + data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) + exe.run(train_prog, feed=feeder.feed([data])) + + for param in train_prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(train_prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + 
+class TestFleetWithASPAMPStatic(unittest.TestCase): def setUp(self): os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 346939fb5ce28..381461130ed5c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -31,4 +31,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) + py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py index 4cdd51e42adf0..af7a44b5aaa23 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py @@ -54,6 +54,35 @@ from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulGradOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulV2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulV2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MemcpyOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MulOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MulGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import OneHotOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReadFromArrayOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceSumOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceSumGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Reshape2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Reshape2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceMeanOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceMeanGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SamplingIdOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ScaleOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SliceOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxWithCrossEntropyOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxWithCrossEntropyGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SplitOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Squeeze2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SquareOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SquareGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SumOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import TopKOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import 
Transpose2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Unsqueeze2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import WriteToArrayOpCost from test_cluster import cluster_json @@ -244,6 +273,155 @@ def test_comp_cost(self): self.assertTrue(op_cost.time >= 0) self.assertTrue(op_cost.memory >= 0) + op_cost = MatmulV2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MemcpyOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MulOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MulGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = OneHotOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReadFromArrayOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceSumOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceSumGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Reshape2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MatmulV2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Reshape2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceMeanOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceMeanGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SamplingIdOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ScaleOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SliceOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxWithCrossEntropyOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = 
SoftmaxWithCrossEntropyGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SplitOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Squeeze2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SquareOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SquareGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SumOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = TopKOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Transpose2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Transpose2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Unsqueeze2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = WriteToArrayOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) # Remove unnecessary files if os.path.exists(cluster_json_path): os.remove(cluster_json_path) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py new file mode 100644 index 0000000000000..f7718e584f5e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -0,0 +1,204 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import json + +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F + +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [[0, 1], [2, 3]] + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, 0] + }) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _g_process_mesh[1], + "dims_mapping": [0, -1] + }) + out = self.linear1(out) + + return out + + +def get_program(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + with static.program_guard(train_program, start_program): + # input + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] + # dataloader + dataloader = paddle.io.DataLoader.from_generator( + feed_list=data_holder, capacity=4 * batch_size, iterable=False) + dataloader.set_batch_generator( + batch_generator_creator(), places=paddle.static.cuda_places()) + # data dist_attr + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [0, -1, -1] + }) + auto.shard_tensor( + label, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [0, -1, -1] + }) + + mlp_start = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_start(input) + + mlp_mid = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_mid(pred) + + mlp_end = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * 
hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_end(pred) + + error_cost = paddle.nn.functional.square_error_cost(pred, label) + loss = paddle.mean(error_cost) + + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = {"inputs": [input], "labels": [label]} + fetch_vars = {"loss": [loss]} + + return train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars + + +class TestDistributedContext(unittest.TestCase): + def test_backup_restore(self): + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program( + ) + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars) + dist_context.initialize() + + dist_context._backup(serial=True, dist=True) + dist_context._restore( + serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_backup") + + dist_context._backup(serial=True, dist=True) + dist_context._restore( + serial=True, + serial_mode="to_original", + dist=True, + dist_mode="to_original") + + dist_context._backup(serial=True, dist=True) + dist_context._restore(serial=True, dist=True, dist_mode="to_default") + + dist_context._backup(serial=True, dist=True) + dist_context._restore(serial=True, dist=True, dist_mode="to_nothing") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index aa0bf719fab29..8af055a09a343 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -94,7 +94,8 @@ def test_dist_slice_serial(self): ops = dist_main_prog.global_block().ops for op in ops: op_dist_attr = dist_context.get_op_dist_attr_for_program(op) - assert op_dist_attr.impl_type == "slice" + # We amend this impl_type after completion + assert op_dist_attr.impl_type == "default" for out in op.output_arg_names: var_dims_mapping = op_dist_attr.get_output_dims_mapping(out) ref_dims_mapping = [-1 for i in range(len(var_dims_mapping))] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py index 1179fd9a9f088..9989f5bbdc605 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -27,7 +27,7 @@ from paddle.distributed.auto_parallel.utils import make_data_unshard from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context -from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.operators import find_compatible_distributed_operator_impls from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index 894bed7108a1d..d296d9433302d 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py +++ 
b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -28,7 +28,7 @@ from paddle.distributed.auto_parallel.utils import make_data_unshard from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context -from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.operators import find_compatible_distributed_operator_impls from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/collective_global_gather.py b/python/paddle/fluid/tests/unittests/collective_global_gather.py index d3a6071ed04df..164abe0593491 100644 --- a/python/paddle/fluid/tests/unittests/collective_global_gather.py +++ b/python/paddle/fluid/tests/unittests/collective_global_gather.py @@ -23,6 +23,7 @@ import paddle.fluid.layers as layers from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main import pickle +from paddle.fluid.framework import _enable_legacy_dygraph paddle.enable_static() @@ -74,6 +75,9 @@ def run_trainer(self, args): world_size = 2 tot_expert = n_expert * world_size paddle.disable_static() + + # Call paddle.distributed.alltoall() under legacy dygraph + _enable_legacy_dygraph() np.random.seed(os.getpid()) local_expert_count = np.random.randint( 1, 4, size=tot_expert).astype("int64") diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index fb918f4ae00ed..2e2918facf896 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -24,6 +24,7 @@ from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import CONVERSION_OPTIONS from test_program_translator import get_source_code +import paddle.jit.dy2static as _jst program_translator = ProgramTranslator() @@ -255,7 +256,7 @@ def _get_answer_code(self): return get_source_code(self.answer_func) def _get_transformed_code(self): - transformed_func = paddle.jit.dy2static.convert_call(self.func) + transformed_func = _jst.convert_call(self.func) return get_source_code(transformed_func) def test_code(self): @@ -275,7 +276,7 @@ def set_func(self): def set_answer_func(self): class StaticCode(): def func_convert_then_not_to_static(x): - y = paddle.jit.dy2static.convert_call(func_not_to_static)(x) + y = _jst.convert_call(func_not_to_static)(x) return y self.answer_func = StaticCode.func_convert_then_not_to_static diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py new file mode 100644 index 0000000000000..7383c834ba9a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle + + +def drop_path(x, training=False): + if not training: + return x + else: + return 2 * x + + +class DropPath(paddle.nn.Layer): + def __init__(self): + super(DropPath, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + return drop_path(x, self.training) + + +class TestTrainEval(unittest.TestCase): + def setUp(self): + self.model = DropPath() + + def tearDown(self): + pass + + def test_train_and_eval(self): + x = paddle.to_tensor([1, 2, 3]).astype("int64") + eval_out = x.numpy() + train_out = x.numpy() * 2 + self.model.train() + self.assertTrue(np.allclose(self.model(x).numpy(), train_out)) + self.model.eval() + self.assertTrue(np.allclose(self.model(x).numpy(), eval_out)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 9a9e7ee243872..276aa68e895c6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -424,6 +424,41 @@ def test_ast_to_func(self): ProgramTranslator().enable(False) +class IfElseNet(paddle.nn.Layer): + def __init__(self): + super(IfElseNet, self).__init__() + self.param = self.create_parameter( + shape=[3, 2], dtype='float32', is_bias=False) + + @paddle.jit.to_static + def forward(self, a, b, c): + a = paddle.matmul(a, self.param) + a = paddle.reshape(a, (2, 4)) + cond = paddle.to_tensor([10]) + if cond == 10: + a_argmax = a.argmax(axis=-1) + b = b + self.param + else: + print(c) + return b + + +class TestDy2StIfElseBackward(unittest.TestCase): + def test_run_backward(self): + a = paddle.randn((4, 3), dtype='float32') + a.stop_gradient = False + b = paddle.to_tensor([10]).astype('float32') + b.stop_gradient = False + c = paddle.to_tensor([2]) + c.stop_gradient = False + + net = IfElseNet() + net.train() + out = net(a, b, c) + out.backward() + self.assertTrue(np.allclose((b + net.param).numpy(), out.numpy())) + + if __name__ == '__main__': with paddle.fluid.framework._test_eager_guard(): unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index e3d34184a38fc..8dac888993590 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -65,7 +65,7 @@ def set_test_func(self): self.func = simple_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7, 8] + self.static_abs_lineno_list = [7, 8, 9] def set_dygraph_info(self): self.line_num = 3 @@ -149,7 +149,7 @@ def set_test_func(self): self.func = nested_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 8, 9, 10, 11] + self.static_abs_lineno_list = [7, 9, 10, 11, 12] def set_dygraph_info(self): self.line_num = 5 @@ -174,7 +174,7 @@ def set_test_func(self): self.func = decorated_func def set_static_lineno(self): - 
self.static_abs_lineno_list = [6, 7] + self.static_abs_lineno_list = [7, 8] def set_dygraph_info(self): self.line_num = 2 @@ -208,7 +208,7 @@ def set_test_func(self): self.func = decorated_func2 def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7] + self.static_abs_lineno_list = [7, 8] def set_dygraph_info(self): self.line_num = 2 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 427e4c2252451..4f55dbd324c21 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -135,22 +135,23 @@ def test_switch_eval_and_train(self): x = fluid.dygraph.to_variable(x_data) linear_net(x) - _, partial_layer = linear_net.forward.program_cache.last()[-1] + _, train_partial_layer = linear_net.forward.program_cache.last()[-1] # check default mode is for training - self.assertEqual(partial_layer.program, - partial_layer._train_program) + self.assertEqual(train_partial_layer.program, + train_partial_layer._train_program) # switch to run test program after `eval()` linear_net.eval() linear_net(x) - self.assertEqual(partial_layer.program, - partial_layer._infer_program) + _, eval_partial_layer = linear_net.forward.program_cache.last()[-1] + self.assertEqual(eval_partial_layer.program, + eval_partial_layer._infer_program) # switch back into training linear_net.train() linear_net(x) - self.assertEqual(partial_layer.program, - partial_layer._train_program) + self.assertEqual(train_partial_layer.program, + train_partial_layer._train_program) class TestWithNoGrad(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index b0ffbac88fb42..4e90c73baa944 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code +import paddle.jit.dy2static as _jst from ifelse_simple_func import dyfunc_with_if_else @@ -76,40 +77,38 @@ def false_fn_0(x_v): x_v = x_v + 1 return x_v - x_v = paddle.jit.dy2static.convert_ifelse( + x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, (x_v, ), (x_v, ), (x_v, )) - __return_0 = paddle.jit.dy2static.create_bool_as_type(label is not None, - False) + __return_0 = _jst.create_bool_as_type(label is not None, False) def true_fn_1(__return_0, __return_value_0, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_0 = paddle.jit.dy2static.create_bool_as_type( - label is not None, True) + __return_0 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = loss return __return_0, __return_value_0 def false_fn_1(__return_0, __return_value_0): return __return_0, __return_value_0 - __return_0, __return_value_0 = (paddle.jit.dy2static.convert_ifelse( + __return_0, __return_value_0 = _jst.convert_ifelse( label is not None, true_fn_1, false_fn_1, (__return_0, __return_value_0, label, x_v), - (__return_0, __return_value_0), (__return_0, __return_value_0))) + (__return_0, __return_value_0), (__return_0, __return_value_0)) def true_fn_2(__return_0, 
__return_value_0, x_v): - __return_1 = paddle.jit.dy2static.create_bool_as_type( - paddle.jit.dy2static.convert_logical_not(__return_0), True) + __return_1 = _jst.create_bool_as_type( + _jst.convert_logical_not(__return_0), True) __return_value_0 = x_v return __return_value_0 def false_fn_2(__return_value_0): return __return_value_0 - __return_value_0 = paddle.jit.dy2static.convert_ifelse( - paddle.jit.dy2static.convert_logical_not(__return_0), true_fn_2, - false_fn_2, (__return_0, __return_value_0, - x_v), (__return_value_0, ), (__return_value_0, )) + __return_value_0 = _jst.convert_ifelse( + _jst.convert_logical_not(__return_0), true_fn_2, false_fn_2, + (__return_0, __return_value_0, + x_v), (__return_value_0, ), (__return_value_0, )) return __return_value_0 @@ -128,40 +127,38 @@ def false_fn_3(x_v): x_v = x_v + 1 return x_v - x_v = paddle.jit.dy2static.convert_ifelse( + x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_3, false_fn_3, (x_v, ), (x_v, ), (x_v, )) - __return_2 = paddle.jit.dy2static.create_bool_as_type(label is not None, - False) + __return_2 = _jst.create_bool_as_type(label is not None, False) def true_fn_4(__return_2, __return_value_1, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_2 = paddle.jit.dy2static.create_bool_as_type( - label is not None, True) + __return_2 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = loss return __return_2, __return_value_1 def false_fn_4(__return_2, __return_value_1): return __return_2, __return_value_1 - __return_2, __return_value_1 = paddle.jit.dy2static.convert_ifelse( - label is not None, true_fn_4, false_fn_4, ( - __return_2, __return_value_1, label, x_v), + __return_2, __return_value_1 = _jst.convert_ifelse( + label is not None, true_fn_4, false_fn_4, + (__return_2, __return_value_1, label, x_v), (__return_2, __return_value_1), (__return_2, __return_value_1)) def true_fn_5(__return_2, __return_value_1, x_v): - __return_3 = paddle.jit.dy2static.create_bool_as_type( - paddle.jit.dy2static.convert_logical_not(__return_2), True) + __return_3 = _jst.create_bool_as_type( + _jst.convert_logical_not(__return_2), True) __return_value_1 = x_v return __return_value_1 def false_fn_5(__return_value_1): return __return_value_1 - __return_value_1 = paddle.jit.dy2static.convert_ifelse( - paddle.jit.dy2static.convert_logical_not(__return_2), true_fn_5, - false_fn_5, (__return_2, __return_value_1, - x_v), (__return_value_1, ), (__return_value_1, )) + __return_value_1 = _jst.convert_ifelse( + _jst.convert_logical_not(__return_2), true_fn_5, false_fn_5, + (__return_2, __return_value_1, + x_v), (__return_value_1, ), (__return_value_1, )) return __return_value_1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index d05be03bbfb19..5cf9d7749c358 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -597,9 +597,11 @@ def test(self): class TestPaddleShape(unittest.TestCase): def test_paddle_shape(self): func = paddle.jit.to_static(dyfunc_len_paddle_shape) - self.assertEqual('paddle.shape(x)' in func.code, True) + func_code = func.code.replace("\n", "").replace(" ", "") + self.assertEqual('paddle.shape(x)' in func_code, True) func = paddle.jit.to_static(dyfunc_dict_assign_shape) - self.assertEqual("__static_convert_var_shape_suffix" in func.code, True) + func_code = 
func.code.replace("\n", "").replace(" ", "") + self.assertEqual("__static_convert_var_shape_suffix" in func_code, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh new file mode 100644 index 0000000000000..6f491ef107104 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +partition_name=pod64 +vipu_server=10.137.96.62 +allclose_script=" +import sys +import numpy as np +data1 = np.loadtxt(\"ipu_res.txt\") +data2 = np.loadtxt(\"cpu_res.txt\") +if np.allclose(data1[::16], data2, atol=1e-6): + sys.exit(0) +else: + sys.exit(1) +" + +for opt in lamb sgd adam ; +do + for onchip in False True ; + do + for rts in False True ; + do + echo "Testcase: opt: ${opt}, onchip: ${onchip}, rts: ${rts}" + echo "paddle.distributed.fleet.launch test with IPUs..." + python3.7 -m paddle.distributed.launch \ + --device_num=8 \ + ipu \ + --hosts=localhost \ + --nproc_per_host=2 \ + --ipus_per_replica=2 \ + --ipu_partition=${partition_name} \ + --vipu_server=${vipu_server} \ + test_dist_data_parallel_ipu.py ${opt} ipu_res.txt ${onchip} ${rts} > ipu.log + echo "paddle.distributed.fleet.launch test with IPUs...Done" + + echo "paddle normal test with CPU..." + export POPLAR_IPUMODEL=1 + python3.7 test_dist_data_parallel_ipu.py ${opt} cpu_res.txt > cpu.log + unset POPLAR_IPUMODEL + echo "paddle normal test with CPU...Done" + + echo "Compare results..." + python3.7 -c """${allclose_script}""" + if [ $? -eq 0 ];then + echo "Compare results...Done" + else + echo "Error occurs. Please check ipu.log, cpu.log, ipu_res.txt and cpu_res.txt" + exit 0 + fi + done + done +done + +if [ -f "ipu.log" ]; then + rm "ipu.log" +fi +if [ -f "cpu.log" ]; then + rm "cpu.log" +fi +if [ -f "ipu_res.txt" ]; then + rm "ipu_res.txt" +fi +if [ -f "cpu_res.txt" ]; then + rm "cpu_res.txt" +fi diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py new file mode 100644 index 0000000000000..6054f2be7579e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py @@ -0,0 +1,184 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import sys +import os +import random +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + +mpi_comm = None + + +@unittest.skip('Disable distributed tests on auto CI.') +class TestBase(IPUOpTest): + def set_attrs(self, enable_ipu, optimizer, log, onchip=False, rts=False): + self.ipu_options = { + "enable_pipelining": True, + "batches_per_step": 1, + "enable_gradient_accumulation": True, + "accumulation_factor": 4, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "location_optimizer": { + "on_chip": onchip, + "use_replicated_tensor_sharding": rts + } + } + + self.cpu_bs = 16 + self.ipu_bs = 1 + self.optimizer = optimizer + self.log = log + self.enable_ipu = enable_ipu + + def test(self): + seed = 2021 + np.random.seed(seed) + random.seed(seed) + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = seed + startup_prog.random_seed = seed + + bs = self.ipu_bs if self.enable_ipu else self.cpu_bs + data = np.random.rand(1, 3, 10, 10).astype(np.float32) + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data( + name='image', shape=[bs, 3, 10, 10], dtype='float32') + with paddle.static.ipu_shard_guard(index=0, stage=0): + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + with paddle.static.ipu_shard_guard(index=1, stage=1): + conv2 = paddle.static.nn.conv2d( + conv1, num_filters=3, filter_size=3, bias_attr=False) + # should consider influence of bs + loss = paddle.mean(conv2) + + if self.optimizer == 'sgd': + opt = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + opt = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + opt = paddle.optimizer.Lamb(learning_rate=1e-2) + else: + raise Exception('optimizer must be sgd, adam or lamb') + + opt.minimize(loss) + + if self.enable_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + executor = paddle.static.Executor(place) + executor.run(startup_prog) + + if self.enable_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2 * self.ipu_options['replicated_graph_count'], + is_training=True, + enable_manual_shard=True) + ipu_strategy.set_options(self.ipu_options) + ipu_strategy.set_options({ + "enable_distribution": True, + "enable_distributed_replicated_graphs": True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": 4 + }) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + feed_list, fetch_list) + feed = { + "image": np.tile(data, [ + self.ipu_options['replicated_graph_count'] * + self.ipu_options['batches_per_step'] * + self.ipu_options['accumulation_factor'], 1, 1, 1 + ]) + } + + else: + program = main_prog + feed = {"image": np.tile(data, [self.cpu_bs, 1, 1, 1])} + + epoch = 10 + if not self.enable_ipu: + # global replication factor + epoch *= 4 + epoch *= self.ipu_options['batches_per_step'] + epoch *= self.ipu_options['accumulation_factor'] + epoch = epoch / (self.cpu_bs / self.ipu_bs) + + results = [] + for i in range(int(epoch)): + res = executor.run(program, feed=feed, 
fetch_list=[loss]) + if self.enable_ipu: + res = mpi_comm.gather(res, root=0) + results.append(res) + if self.enable_ipu: + if int(os.environ.get("PADDLE_TRAINER_ID")) == 0: + np.savetxt(self.log, np.array(results).flatten()) + else: + np.savetxt(self.log, np.array(results).flatten()) + + +if __name__ == "__main__": + paddle.enable_static() + # Run distributed tests + if len(sys.argv) == 5: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + + optimizer = sys.argv[1] + log = sys.argv[2] + onchip = True if sys.argv[3] == "True" else False + rts = True if sys.argv[4] == "True" else False + test = TestBase() + test.set_attrs( + enable_ipu=True, + optimizer=optimizer, + log=log, + onchip=onchip, + rts=rts) + test.test() + # Run cpu tests for compare + elif len(sys.argv) == 3: + test = TestBase() + test.set_attrs(enable_ipu=False, optimizer=sys.argv[1], log=sys.argv[2]) + test.test() + else: + raise ValueError( + "Only support 3 or 5 args. 3 for cpu test, 5 for ipu distributed test" + ) diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py new file mode 100644 index 0000000000000..44c26d123ba39 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
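Editorial aside, not part of the patch: the epoch rescaling in test_dist_data_parallel_ipu.py above is easier to follow as plain arithmetic. The CPU baseline has to consume the same number of samples as the two IPU instances combined; the minimal sketch below reuses the test's own settings, and the per-step sample count for the IPU run is an assumption about how the replication and accumulation options multiply, not a statement about the IPU runtime.

# Sketch of the epoch arithmetic used in the test above (illustration only).
cpu_bs, ipu_bs = 16, 1
batches_per_step, accumulation_factor = 1, 4
global_replication_factor = 4                  # 2 instances x 2 local replicas

# Assumed samples consumed per executor.run() across all replicas:
ipu_samples_per_step = (ipu_bs * batches_per_step * accumulation_factor
                        * global_replication_factor)           # 16

epoch = 10                                     # IPU steps in the test
epoch *= global_replication_factor             # 40
epoch *= batches_per_step                      # 40
epoch *= accumulation_factor                   # 160
epoch /= cpu_bs / ipu_bs                       # 10 CPU steps of 16 samples each

assert epoch * cpu_bs == 10 * ipu_samples_per_step == 160

Both runs therefore see 160 samples in total, which is what makes the final loss comparison in run_dist_ipu.sh meaningful.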
+''' +python3.7 -m paddle.distributed.launch \ +--device_num=128 \ +ipu \ +--hosts=host1,host2 \ +--ipus_per_host=2 \ +--nproc_per_host=1 \ +--ipu_partition=pod128 \ +--vipu_server=lr17-1-ctrl \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py + +Equal to: + +poprun \ +--host=localhost,host2 \ +--num-instances=2 \ +--num-replicas=64 \ +--ipus-per-replica=2 \ +--print-topology=yes \ +--vipu-partition=pod128_bert \ +--vipu-server-host=lr17-1-ctrl \ +--update-partition=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py +''' + +import os +import numpy as np +import paddle + + +def TestDistTraining(): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + np.random.seed(42) + input_data = np.random.uniform(0, 127, size=[128, 3, 2, 1]).astype(np.int32) + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + with paddle.static.ipu_shard_guard(index=0, stage=0): + out = paddle.fluid.layers.embedding(x, **attrs) + with paddle.static.ipu_shard_guard(index=1, stage=1): + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=64, is_training=True, enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, + batches_per_step=1, + enable_gradient_accumulation=True, + accumulation_factor=4) + ipu_strategy.set_options({ + "enable_distribution": True, + "enable_replicated_graphs": True, + "replicated_graph_count": 32, + "enable_distributed_replicated_graphs": True, + "global_replica_offset": + # Paddle : int(os.environ.get("PADDLE_TRAINER_ID")) * 32 + # PopRun : int(os.environ.get("POPDIST_REPLICA_INDEX_OFFSET")) + int(os.environ.get("PADDLE_TRAINER_ID")) * 32, + "global_replication_factor": 64, + "location_optimizer": { + "on_chip": False, + "use_replicated_tensor_sharding": True + } + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + for i in range(10): + res = exe.run(program, + feed={"x": input_data}, + fetch_list=fetch_list) + print("index: {}, result: {}".format(i, res)) + + +if __name__ == "__main__": + TestDistTraining() diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py new file mode 100644 index 0000000000000..6ca9222d914de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Single host: + +python3.7 -m paddle.distributed.launch \ +--device_num=4 \ +ipu \ +--hosts=localhost \ +--nproc_per_host=2 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py + +Equal to: + +poprun \ +--host=localhost \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py +''' +''' +Multi hosts: + +python3.7 -m paddle.distributed.launch \ +--device_num=4 \ +ipu \ +--hosts=host1,host2 \ +--nproc_per_host=1 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py + +Equal to: + +poprun \ +--host=host1,host2 \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py +''' + +import os +import sys +import paddle +import numpy as np + +mpi_comm = None + + +def Test(use_dist, file_name): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + + out = paddle.fluid.layers.embedding(x, **attrs) + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + if use_dist: + ipu_strategy.set_graph_config(num_ipus=2, is_training=True) + # Set distributed envs + ipu_strategy.set_options({ + "enable_distribution": True, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "enable_distributed_replicated_graphs": True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": 4 + }) + else: + ipu_strategy.set_graph_config(num_ipus=4, is_training=True) + ipu_strategy.set_options({ + "enable_replicated_graphs": True, + "replicated_graph_count": 4, + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + if use_dist: + if os.environ.get("PADDLE_TRAINER_ID") == "0": + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) + .astype(np.int32), np.array( + [[[1], [3]], [[2], [4]], [[4], [127]]]).astype( + np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], [[4], 
[127]]]).astype( + np.int32), np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], [[90], [13]]]).astype( + np.int32), np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + feed_data = {"x": input_data} + + for step in range(10): + res = exe.run(program, feed=feed_data, fetch_list=fetch_list) + + if use_dist: + if os.getenv("PADDLE_TRAINER_ID") == "0": + res = mpi_comm.gather(res, root=0) + np.savetxt(file_name, res) + else: + np.savetxt(file_name, res) + + +if __name__ == "__main__": + file_name = sys.argv[1] + + use_dist = False + if 'PADDLE_TRAINER_ID' in os.environ: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + use_dist = True + + Test(use_dist, file_name) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 2583d9409a0a7..ad11083b67773 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -128,6 +128,11 @@ def setUpClass(cls): cls.fetch_list: List[str] = None cls.output_dict: Optional[Dict] = {} + def tearDown(self): + # Manual reset when using ipumodel + if self.use_ipumodel(): + paddle.framework.core.IpuBackend.get_instance().reset() + @property def fp16_enabled(self): return True diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 487a69807e2b0..39f55fb45b87b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -61,7 +61,6 @@ def setUp(self): self.fuse_activation = "" self.fuse_alpha = 0 self.fuse_beta = 0 - self.fuse_brelu_threshold = 6.0 self.fuse_residual_connection = False self.input_residual_size = None @@ -99,7 +98,6 @@ def setUp(self): self.attrs['fuse_activation'] = self.fuse_activation self.attrs['fuse_alpha'] = self.fuse_alpha self.attrs['fuse_beta'] = self.fuse_beta - self.attrs['fuse_brelu_threshold'] = self.fuse_brelu_threshold self.attrs['fuse_residual_connection'] = self.fuse_residual_connection self.outputs['Output'] = output diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py index 46ee2a14a2018..7b0bb706aece9 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py @@ -14,7 +14,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, skip_check_grad_ci import paddle.fluid as fluid import paddle @@ -92,6 +92,17 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].sum()} +@OpTestTool.skip_if_not_cpu() +class TestReduceSum4DNoReduceSimpleCopyOneDNNOp( + TestReduceDefaultWithGradOneDNNOp): + def setUp(self): + self.op_type = "reduce_sum" + self.use_mkldnn = True + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'dim': tuple(), 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': np.copy(self.inputs['X'])} 
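Editorial aside, not part of the patch: the new TestReduceSum4DNoReduceSimpleCopyOneDNNOp case above passes an empty dim tuple, which makes reduce_sum reduce over no axes at all, so the expected output is simply a copy of the input. A minimal NumPy check of that expectation (it says nothing about the oneDNN kernel itself):

import numpy as np

x = np.random.random((5, 6, 2, 10)).astype("float32")
# Summing over an empty tuple of axes performs no reduction, so the result
# equals the input, matching the test's expected output np.copy(x).
np.testing.assert_array_equal(np.sum(x, axis=()), x)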
+ + @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") diff --git a/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py new file mode 100644 index 0000000000000..0c33bd6b1ade8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +np.random.seed(10) + + +class TestAbs(OpTest): + def setUp(self): + self.op_type = "abs" + self.set_mlu() + self.dtype = 'float32' + self.shape = [4, 25] + + np.random.seed(1024) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], check_eager=False) + + +class TestAbsHalf(OpTest): + def setUp(self): + self.op_type = "abs" + self.set_mlu() + self.dtype = 'float16' + self.shape = [7, 9, 13, 19] + + np.random.seed(1024) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], check_eager=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py new file mode 100644 index 0000000000000..854ac0b6826cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest +from paddle.fluid import core +import paddle + +alignment = 256 +paddle.enable_static() + + +class TestAllocContinuousSpace(OpTest): + def setUp(self): + self.op_type = "coalesce_tensor" + self.dtype, self.fluid_dtype = self.init_dtype() + attrs = self.init_attr() + self.copy_data = attrs["copy_data"] + self.constant = attrs["constant"] + self.set_constant = attrs["set_constant"] + self.Inputs = self.init_input() + self.Outputs, self.FusedOutput = self.init_output( + self.Inputs, self.set_constant, self.constant) + self.inputs = {'Input': self.Inputs} + self.attrs = attrs + self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput} + + def init_dtype(self): + return np.float32, int(core.VarDesc.VarType.FP32) + + def init_input(self): + inputs = [] + inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype))) + inputs.append(("x2", np.random.random([20]).astype(self.dtype))) + inputs.append(("x3", np.random.random([1]).astype(self.dtype))) + inputs.append(("x4", np.random.random([200, 30]).astype(self.dtype))) + inputs.append(("x5", np.random.random([30]).astype(self.dtype))) + inputs.append(("x6", np.random.random([1]).astype(self.dtype))) + return inputs + + def init_attr(self): + return { + "copy_data": True, + "set_constant": False, + "constant": 0.0, + "dtype": self.fluid_dtype + } + + def init_output(self, input_list, set_constant, constant): + inputs = [] + outputs = input_list + + for input in input_list: + length = len(input[1].flatten()) + aligned_len = (length + alignment) / alignment * alignment + out = np.zeros(int(aligned_len)) + out[0:length] = input[1].flatten() + inputs.append(out) + + coalesce_tensor_var = np.concatenate([input for input in inputs]) + if set_constant: + coalesce_tensor_var = np.ones((len(coalesce_tensor_var))) * constant + outputs = [(out[0], + np.ones(out[1].shape).astype(self.dtype) * constant) + for out in outputs] + return outputs, coalesce_tensor_var + + def test_check_output(self): + self.check_output_with_place( + place=paddle.device.MLUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) + + +class TestAllocContinuousSpace2(TestAllocContinuousSpace): + def init_attr(self): + return { + "copy_data": False, + "set_constant": True, + "constant": 5, + "dtype": self.fluid_dtype, + "user_defined_size_of_dtype": 2 + } + + def test_check_output(self): + self.check_output_with_place( + place=paddle.device.MLUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py new file mode 100644 index 0000000000000..dea6391b8bae0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py @@ -0,0 +1,163 @@ +# Copyright 
(c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid.core as core +import paddle.nn.functional as F + +np.random.seed(10) +paddle.enable_static() + + +def ref_log_softmax(x): + shiftx = (x - np.max(x)) + out = shiftx - np.log(np.exp(shiftx).sum()) + return out + + +def ref_log_softmax_grad(x, axis): + if axis < 0: + axis += len(x.shape) + out = np.apply_along_axis(ref_log_softmax, axis, x) + axis_dim = x.shape[axis] + dout = np.full_like(x, fill_value=1. / x.size) + dx = dout - np.exp(out) * dout.copy().sum(axis=axis, keepdims=True).repeat( + axis_dim, axis=axis) + return dx + + +class TestLogSoftmaxOp(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.set_mlu() + self.python_api = F.log_softmax + self.dtype = 'float32' + self.shape = [2, 3, 4, 5] + self.axis = -1 + self.set_attrs() + + x = np.random.uniform(0.1, 1., self.shape).astype(self.dtype) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis} + + def set_attrs(self): + pass + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + + +class TestLogSoftmaxShape(TestLogSoftmaxOp): + def set_attrs(self): + self.shape = [12, 10] + + +class TestLogSoftmaxAxis(TestLogSoftmaxOp): + def set_attrs(self): + self.axis = 1 + + +class TestNNLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.set_mlu() + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def check_api(self, axis=-1): + ref_out = np.apply_along_axis(ref_log_softmax, axis, self.x) + + logsoftmax = paddle.nn.LogSoftmax(axis) + # test static api + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = logsoftmax(x) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + # test dygrapg api + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = logsoftmax(x) + self.assertTrue(np.allclose(y.numpy(), ref_out)) + paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + + +class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.set_mlu() + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = 
paddle.device.MLUPlace(0) + + def check_api(self, axis=-1, dtype=None): + x = self.x.copy() + if dtype is not None: + x = x.astype(dtype) + ref_out = np.apply_along_axis(ref_log_softmax, axis, x) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = F.log_softmax(x, axis, dtype) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = F.log_softmax(x, axis, dtype) + self.assertTrue(np.allclose(y.numpy(), ref_out), True) + paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + self.check_api(-1, 'float32') + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='X1', shape=[100], dtype='int32') + self.assertRaises(TypeError, F.log_softmax, x) + + x = paddle.fluid.data(name='X2', shape=[100], dtype='float32') + self.assertRaises(TypeError, F.log_softmax, x, dtype='int32') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py new file mode 100644 index 0000000000000..a56e9ff7558f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py @@ -0,0 +1,235 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
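Editorial aside, not part of the patch: the reference helpers ref_log_softmax and ref_log_softmax_grad in test_log_softmax_op_mlu.py above implement the usual numerically stable formulation. Written out along the reduced axis, with m = max_j x_j and upstream gradient d (the helper uses d_i = 1 / x.size):

\[
\operatorname{log\_softmax}(x)_i = (x_i - m) - \log \sum_j e^{\,x_j - m}, \qquad m = \max_j x_j
\]
\[
\frac{\partial \ell}{\partial x_j}
  = d_j - e^{\operatorname{log\_softmax}(x)_j} \sum_i d_i
  = d_j - \operatorname{softmax}(x)_j \sum_i d_i
\]

The second line is exactly the dout - np.exp(out) * dout.sum(axis=axis, keepdims=True) expression in ref_log_softmax_grad.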
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +import sys +sys.path.append('..') +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.framework import Program, program_guard, _test_eager_guard + +paddle.enable_static() + + +class TestOneHotOp(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_attr(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} + self.outputs = {'Out': (out, x_lod)} + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_default_dtype(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_default_dtype_attr(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_exception(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + self.depth = 10 + self.place = 
core.CPUPlace() + self.dimension = 12 + self.x = core.LoDTensor() + x_lod = [[4, 1, 3, 3]] + data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))] + data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) + self.x.set(data, self.place) + self.x.set_recursive_sequence_lengths(x_lod) + + def test_check_output(self): + program = Program() + with program_guard(program): + x = fluid.layers.data( + name='x', shape=[self.dimension], dtype='float32', lod_level=1) + block = program.current_block() + one_hot_out = block.create_var( + name="one_hot_out", + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='float32') + block.append_op( + type='one_hot', + inputs={'X': x}, + attrs={'depth': self.depth}, + outputs={'Out': one_hot_out}) + exe = fluid.Executor(self.place) + + def run(): + exe.run(feed={'x': self.x}, + fetch_list=[one_hot_out], + return_numpy=False) + + self.assertRaises(ValueError, run) + + +class TestOneHotOpApi(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_api(self): + depth = 10 + self._run(depth) + + def test_api_with_depthTensor(self): + depth = fluid.layers.assign(input=np.array([10], dtype=np.int32)) + self._run(depth) + + def test_api_with_dygraph(self): + depth = 10 + label = np.array([np.random.randint(0, depth - 1) + for i in range(6)]).reshape([6, 1]) + with fluid.dygraph.guard(): + one_hot_label = fluid.one_hot( + input=fluid.dygraph.to_variable(label), depth=depth) + + one_hot_label = paddle.nn.functional.one_hot( + fluid.dygraph.to_variable(label), depth) + # with _test_eager_guard(): + # one_hot_label = paddle.nn.functional.one_hot( + # paddle.to_tensor(label), depth) + + def _run(self, depth): + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + one_hot_label = fluid.one_hot(input=label, depth=depth) + + label_data = np.array([np.random.randint(0, 10 - 1) + for i in range(6)]).reshape([6, 1]) + + exe = fluid.Executor(self.place) + exe.run(fluid.default_startup_program()) + ret = exe.run(feed={'label': label_data, }, + fetch_list=[one_hot_label], + return_numpy=False) + + +class BadInputTestOnehotV2(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_error(self): + with fluid.program_guard(fluid.Program()): + + def test_bad_x(): + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="float32") + one_hot_label = fluid.one_hot(input=label, depth=4) + + self.assertRaises(TypeError, test_bad_x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py index 85ade1179b7d6..c6135383721e1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py @@ -328,5 +328,32 @@ def run(place): run(place) +class TestArgMaxAPI_3(unittest.TestCase): + def initTestCase(self): + self.dims = (1, 9) + self.dtype = 'float32' + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = [paddle.NPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmax(numpy_input).reshape([1]) + paddle_output = paddle.argmax(tensor_input) + self.assertEqual( + 
np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 955f2117778f0..919ae52447128 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -253,6 +253,9 @@ def test_grad(self): class TestCELUDoubleGradCheck(unittest.TestCase): + def celu_wrapper(self, x): + return paddle.nn.functional.celu(x[0], alpha=0.2) + @prog_scope() def func(self, place): shape = [2, 4, 4, 4] @@ -269,6 +272,8 @@ def func(self, place): x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.celu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -280,6 +285,9 @@ def test_grad(self): class TestSqrtDoubleGradCheck(unittest.TestCase): + def sqrt_wrapper(self, x): + return paddle.sqrt(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -294,6 +302,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.sqrt_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -305,6 +315,9 @@ def test_grad(self): class TestRsqrtDoubleGradCheck(unittest.TestCase): + def rsqrt_wrapper(self, x): + return paddle.rsqrt(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -319,6 +332,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.rsqrt_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -330,6 +345,9 @@ def test_grad(self): class TestSquareDoubleGradCheck(unittest.TestCase): + def square_wrapper(self, x): + return paddle.square(x[0]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. 
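Editorial aside, not part of the patch: the *_wrapper helpers added to test_activation_nn_grad.py above exist because gradient_checker.double_grad_check_for_dygraph takes a callable that receives the list of input variables, whereas the existing static-graph double_grad_check works on the variables directly. A hypothetical sketch of the same pattern for another unary op (paddle.tanh here is only an illustration and is not touched by this patch):

import paddle

def tanh_wrapper(xs):
    # The dygraph double-grad checker passes the inputs as a list,
    # so a unary op only uses xs[0].
    return paddle.tanh(xs[0])

# Inside a test body shaped like the ones above (x, y, x_arr, place, eps
# defined as in those tests):
# gradient_checker.double_grad_check(
#     [x], y, x_init=x_arr, place=place, eps=eps)
# gradient_checker.double_grad_check_for_dygraph(
#     tanh_wrapper, [x], y, x_init=x_arr, place=place)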
@@ -344,6 +362,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.square_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 58d8610ee352d..7be3b300d55a1 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2003,6 +2003,7 @@ def setUp(self): self.op_type = "celu" self.init_dtype() + self.python_api = paddle.nn.functional.celu np.random.seed(1024) x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) alpha = 1.5 @@ -2014,7 +2015,7 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestCELUAPI(unittest.TestCase): @@ -2080,6 +2081,11 @@ def test_errors(self): name='x_fp16', shape=[10, 12], dtype='float16') self.celu(x_fp16) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_dygraph_api() + self.test_errors() + class TestReciprocal(TestActivation): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 3e2f112e964bb..225bd35a8ec9d 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -271,6 +271,115 @@ def test_adamw_op_dygraph(self): adam.clear_gradients() +class TestAdamWOpMultiPrecison(unittest.TestCase): + def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False): + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + input = paddle.randn((5, 5)) + + model = paddle.nn.Linear(5, 5) + + optimizer = paddle.optimizer.AdamW( + parameters=[{ + 'params': model.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99 + }], + multi_precision=use_amp) + + for idx in range(2): + if place == 'gpu' and use_amp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + def _get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._test_adamw_op_dygraph_place_amp(place, use_amp) + + +class TestAdamWOpError(unittest.TestCase): + def test_api_errors(self): + def test_weight_decay_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=1) + + def test_parameters_dtype1(): + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=paddle.randn((5, 5)), + weight_decay=0.1) + + def test_parameters_dtype2(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters={'params': linear.parameters()}, + weight_decay=0.1) + + def test_parameters_dtype3(): + 
adam = paddle.optimizer.AdamW( + learning_rate=0.01, parameters=None, weight_decay=0.1) + + def test_parameters_dtype4(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters={'params': set(linear.parameters())}, + weight_decay=0.1) + + def test_learning_rate_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=1, + parameters=linear.parameters(), + weight_decay=0.1) + + def test_grad_clip_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.1, + grad_clip=0.1) + + self.assertRaises(TypeError, test_weight_decay_dtype) + self.assertRaises(TypeError, test_parameters_dtype1) + self.assertRaises(TypeError, test_parameters_dtype2) + self.assertRaises(AttributeError, test_parameters_dtype3) + self.assertRaises(TypeError, test_parameters_dtype4) + self.assertRaises(TypeError, test_learning_rate_dtype) + self.assertRaises(TypeError, test_grad_clip_dtype) + + class TestAdamWOpGroupWithLR(TestAdamWOp): def test_adamw_op_dygraph(self): paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index dcf07f4953200..bea7588acd3d0 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -221,7 +221,44 @@ def test_check_grad_input(self): self.check_grad(['Input'], 'Out', no_grad_set=None) -class TestAddMMOp4(unittest.TestCase): +class TestAddMMOp4(OpTest): + # test broadcast + def setUp(self): + self.op_type = "addmm" + self.dtype = np.float64 + self.init_dtype_type() + self.inputs = { + 'Input': np.random.random((100)).astype(self.dtype), + 'X': np.random.random((20, 10)).astype(self.dtype), + 'Y': np.random.random((10, 100)).astype(self.dtype), + } + self.attrs = { + 'Alpha': 0.5, + 'Beta': 2.0, + } + self.outputs = {'Out': self.attrs['Beta'] * self.inputs['Input'] + \ + self.attrs['Alpha'] * np.dot(self.inputs['X'], self.inputs['Y'])} + + def init_dtype_type(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['Input', 'X', 'Y'], 'Out') + + def test_check_grad_x(self): + self.check_grad(['X'], 'Out', no_grad_set=None) + + def test_check_grad_y(self): + self.check_grad(['Y'], 'Out', no_grad_set=None) + + def test_check_grad_input(self): + self.check_grad(['Input'], 'Out', no_grad_set=None) + + +class TestAddMMOp5(unittest.TestCase): def test_api_with_dygraph(self): np_input = np.random.random((20, 30)).astype(np.float32) np_x = np.random.random((20, 6)).astype(np.float32) @@ -235,7 +272,6 @@ def test_api_with_dygraph(self): assert np.allclose(np_input + np.dot(np_x, np_y), out.numpy()) -''' class TestAddMMAPI(unittest.TestCase): def test_api_error(self): data_x = np.ones((2, 2)).astype(np.float32) @@ -249,9 +285,106 @@ def test_error1(): x = paddle.to_tensor(data_x_wrong) y = paddle.to_tensor(data_y) input = paddle.to_tensor(data_input) - out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + self.assertRaises(ValueError, test_error1) -''' + + def test_error2(): + data_x_wrong = np.ones((2)).astype(np.float32) + x = paddle.to_tensor(data_x_wrong) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + + 
self.assertRaises(ValueError, test_error2) + + def test_error3(): + data_input_wrong = np.ones((2, 2, 2)).astype(np.float32) + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input_wrong) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + + self.assertRaises(ValueError, test_error3) + + def test_error4(): + data_input_wrong = np.ones((5)).astype(np.float32) + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input_wrong) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + + self.assertRaises(ValueError, test_error4) + + paddle.enable_static() + + def test_api_normal_1(self): + data_x = np.ones((2, 2)).astype(np.float32) + data_y = np.ones((2, 2)).astype(np.float32) + data_input = np.ones((2, 2)).astype(np.float32) + data_alpha = 0.1 + data_beta = 1.0 + + paddle.disable_static() + + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + paddle_output = paddle.tensor.addmm( + input=input, x=x, y=y, beta=data_beta, alpha=data_alpha) + numpy_output = data_beta * data_input + data_alpha * np.dot(data_x, + data_y) + + self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True) + + paddle.enable_static() + + def test_api_normal_2(self): + data_x = np.ones((3, 10)).astype(np.float32) + data_y = np.ones((10, 3)).astype(np.float32) + data_input = np.ones((3)).astype(np.float32) + data_alpha = 0.1 + data_beta = 1.0 + + paddle.disable_static() + + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + paddle_output = paddle.tensor.addmm( + input=input, x=x, y=y, beta=data_beta, alpha=data_alpha) + numpy_output = data_beta * data_input + data_alpha * np.dot(data_x, + data_y) + + self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True) + + paddle.enable_static() + + def test_api_normal_3(self): + data_x = np.ones((3, 10)).astype(np.float32) + data_y = np.ones((10, 3)).astype(np.float32) + data_input = np.ones((1)).astype(np.float32) + data_alpha = 0.1 + data_beta = 1.0 + + paddle.disable_static() + + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + paddle_output = paddle.tensor.addmm( + input=input, x=x, y=y, beta=data_beta, alpha=data_alpha) + numpy_output = data_beta * data_input + data_alpha * np.dot(data_x, + data_y) + + self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True) + + paddle.enable_static() + if __name__ == "__main__": paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_bfgs.py b/python/paddle/fluid/tests/unittests/test_bfgs.py index 1a12913bc72e9..8a9f9f72aa068 100644 --- a/python/paddle/fluid/tests/unittests/test_bfgs.py +++ b/python/paddle/fluid/tests/unittests/test_bfgs.py @@ -22,9 +22,6 @@ from paddle.incubate.optimizer.functional.bfgs import minimize_bfgs from paddle.fluid.framework import _test_eager_guard -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() - np.random.seed(123) diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 00294bf6071b3..a4e71db3d3850 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -191,7 +191,8 @@ def check_with_place(self, path_id="0", static_mode="1", check_error_log=False, - 
need_envs={}): + need_envs={}, + eager_mode=True): if backend == "nccl" or backend == "bkcl": with_gloo = '0' else: @@ -216,6 +217,12 @@ def check_with_place(self, required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" required_envs["GLOO_LOG_LEVEL"] = "TRACE" + + if eager_mode: + required_envs["FLAGS_enable_eager_mode"] = "%d" % 1 + else: + required_envs["FLAGS_enable_eager_mode"] = "%d" % 0 + tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, required_envs) np.random.seed(pid0) diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_gather.py b/python/paddle/fluid/tests/unittests/test_collective_global_gather.py index c9dee529c21a1..6809f3970f683 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_global_gather.py +++ b/python/paddle/fluid/tests/unittests/test_collective_global_gather.py @@ -35,7 +35,16 @@ def test_global_gather_nccl_dygraph(self): "collective_global_gather_dygraph.py", "global_gather", "nccl", - static_mode="0") + static_mode="0", + eager_mode=False) + + def test_global_gather_nccl_dygraph_eager(self): + self.check_with_place( + "collective_global_gather_dygraph.py", + "global_gather", + "nccl", + static_mode="0", + eager_mode=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py index 2b4555de2744d..1485bafa387f5 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py +++ b/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py @@ -35,7 +35,16 @@ def test_global_scatter_nccl_dygraph(self): "collective_global_scatter_dygraph.py", "global_scatter", "nccl", - static_mode="0") + static_mode="0", + eager_mode=False) + + def test_global_scatter_nccl_dygraph_eager(self): + self.check_with_place( + "collective_global_scatter_dygraph.py", + "global_scatter", + "nccl", + static_mode="0", + eager_mode=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index 707991352fa5e..dd6dcf6d5e9ae 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -17,6 +17,8 @@ import paddle.fluid.dygraph as dg import paddle.nn.functional as F import paddle.fluid.initializer as I +import paddle +from paddle.fluid.framework import _test_eager_guard import unittest @@ -134,7 +136,8 @@ def functional(self, place): return y_np def paddle_nn_layer(self): - x_var = dg.to_variable(self.input) + x_var = paddle.to_tensor(self.input) + x_var.stop_gradient = False conv = nn.Conv3D( self.num_channels, self.num_filters, @@ -148,17 +151,23 @@ def paddle_nn_layer(self): if not self.no_bias: conv.bias.set_value(self.bias) y_var = conv(x_var) + y_var.backward() y_np = y_var.numpy() - return y_np + t1 = x_var.gradient() + return y_np, t1 def _test_equivalence(self, place): place = fluid.CPUPlace() result1 = self.fluid_layer(place) result2 = self.functional(place) with dg.guard(place): - result3 = self.paddle_nn_layer() + result3, g1 = self.paddle_nn_layer() + with _test_eager_guard(): + res_eager, g2 = self.paddle_nn_layer() np.testing.assert_array_almost_equal(result1, result2) np.testing.assert_array_almost_equal(result2, result3) + self.assertTrue(np.allclose(result3, res_eager)) + self.assertTrue(np.allclose(g1, g2)) def runTest(self): place = fluid.CPUPlace() diff --git 
a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index 784d89b93f985..5bff8b3142106 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -503,6 +503,75 @@ def test_grad(self): self.func(p) +class TestDepthWiseConvDoubleGradCheckCase1(unittest.TestCase): + def depthwise_conv2d_wrapper(self, x): + return paddle.nn.functional.conv2d(x[0], x[1], groups=4) + + @prog_scope() + def func(self, place): + x_shape = [2, 4, 3, 3] + w_shape = [4, 1, 3, 3] + eps = 0.005 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 + x = layers.data('x', x_shape, False, dtype) + w = layers.data('w', w_shape, False, dtype) + + # condition of depthwise conv: + # use_cudnn == False + # groups == filters + # num_filters % num_channels == 0 + + y = paddle.nn.functional.conv2d(x, w, groups=4) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.depthwise_conv2d_wrapper, [x, w], + y, + x_init=[x_arr, w_arr], + place=place) + + def test_grad(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestConv3DDoubleGradCheck_NN(unittest.TestCase): + def conv3d_wrapper(self, x): + return paddle.nn.functional.conv3d(x[0], x[1]) + + @prog_scope() + def func(self, place): + x_shape = [2, 3, 8, 8, 8] + w_shape = [6, 3, 3, 3, 3] + eps = 0.005 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 + x = layers.data('x', x_shape, False, dtype) + w = layers.data('w', w_shape, False, dtype) + x.persistable = True + w.persistable = True + y = paddle.nn.functional.conv3d(x, w) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv3d_wrapper, [x, w], y, x_init=[x_arr, w_arr], place=place) + + def test_grad(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 6033b809f218d..14a91b0c2c5fe 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -25,6 +25,8 @@ import paddle.fluid.core as core +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Only test cuda Random Generator") class TestGeneratorSeed(unittest.TestCase): """ Test cases for cpu generator seed. 
@@ -70,15 +72,13 @@ def test_generator_gaussian_random_dygraph(self): """Test Generator seed.""" fluid.enable_dygraph() - paddle.seed(12312321111) - x = fluid.layers.gaussian_random([120], dtype="float32") - st1 = paddle.get_cuda_rng_state() - x1 = fluid.layers.gaussian_random([120], dtype="float32") - paddle.set_cuda_rng_state(st1) - x2 = fluid.layers.gaussian_random([120], dtype="float32") - paddle.seed(12312321111) - x3 = fluid.layers.gaussian_random([120], dtype="float32") - x_np = x.numpy() + st = paddle.get_cuda_rng_state() + x1 = paddle.randn([120], dtype="float32") + paddle.set_cuda_rng_state(st) + x2 = paddle.randn([120], dtype="float32") + paddle.set_cuda_rng_state(st) + x3 = paddle.randn([120], dtype="float32") + x1_np = x1.numpy() x2_np = x2.numpy() x3_np = x3.numpy() @@ -86,7 +86,7 @@ def test_generator_gaussian_random_dygraph(self): if core.is_compiled_with_cuda(): print(">>>>>>> gaussian random dygraph >>>>>>>") self.assertTrue(np.allclose(x1_np, x2_np)) - self.assertTrue(np.allclose(x_np, x3_np)) + self.assertTrue(np.allclose(x2_np, x3_np)) def test_generator_randint_dygraph(self): """Test Generator seed.""" diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 600a49b2332be..bb8c6346eb5a5 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -279,6 +279,16 @@ def constructor(self, place): "The type of trainable MUST be bool, but the type is /*"): eager_param.trainable = "False" + eager_param_2 = EagerParamBase( + shape=paddle.shape(paddle.to_tensor([1, 2, 3, 4])), dtype="float32") + self.assertTrue(eager_param_2.trainable) + eager_param_2.trainable = False + self.assertFalse(eager_param_2.trainable) + with self.assertRaisesRegexp( + ValueError, + "The type of trainable MUST be bool, but the type is /*"): + eager_param_2.trainable = "False" + def test_constructor(self): print("Test_constructor") paddle.set_device("cpu") diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py index 43b5ce96a3901..26aaf0f44f1d2 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum.py +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -18,6 +18,9 @@ import paddle from paddle.fluid import core +import os +os.environ['FLAGS_new_einsum'] = "0" + class TestErrors(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py index 565e43214ea32..1a4ae54afefe2 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py @@ -34,7 +34,11 @@ def setUp(self): self.operands.append(("x" + str(idx), inp)) self.inputs = {"Operands": self.operands} self.attrs = {"equation": self.equation} - self.outputs = {'Out': out} + self.outputs = { + 'Out': out, + "InnerCache": [('cache_' + str(i), np.array([1.0])) + for i in range(len(self.operands))] + } def init_input(self): self.inputs = [] @@ -49,7 +53,7 @@ def set_mandatory(self): def test_check_output(self): if not self.disable: - self.check_output() + self.check_output(no_check_set=["InnerCache"]) def test_grad(self): if not self.disable: diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index c58d46edde753..b33a943c9f27e 100644 --- 
a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -478,5 +478,23 @@ def test_shape(self): self.assertEqual(C.shape, (-1, 384)) +class TestBF16(unittest.TestCase): + """ + EinsumOp support bfloat16 type, add unittest here for the correctness. + """ + + def test_shape(self): + cuda_major = paddle.version.cuda().split('.')[0].strip() + if paddle.is_compiled_with_cuda() and int(cuda_major) >= 11: + """ MatmulKernel support bfloat16 only if cuda_major > 11.0. + """ + A = paddle.to_tensor(np.array([1.0, 2.0])).astype(paddle.bfloat16) + A = A.cuda() + B = paddle.to_tensor(np.array([2.0, 3.0])).astype(paddle.bfloat16) + B = B.cuda() + C = paddle.einsum('i,i->', A, B) + self.assertEqual(C.item(), 8.0) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 230bc15e0f1ab..0c8e115d7cebf 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,439 +15,312 @@ from __future__ import print_function import unittest -import math +import itertools import numpy as np import math from op_test import OpTest -import paddle.fluid.core as core # numpy.round has different behavior in comparision to c++ round function # so we use round_c instead of numpy.round to align the output data -def round_c_single_element(x): - dtype = type(x) - if x >= 0: - return dtype(np.floor(x + 0.5)) - else: - return dtype(np.ceil(x - 0.5)) +def round_c_single_element(val): + dtype = type(val) + if val >= 0: + return dtype(np.floor(val + 0.5)) + return dtype(np.ceil(val - 0.5)) round_c = np.vectorize(round_c_single_element) -class TestFakeQuantizeOp(OpTest): - def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_abs_max" - self.attrs = {'bit_length': 8} - self.inputs = {'X': np.random.random((124, 240)).astype(self.dtype), } - scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) - self.outputs = { - 'Out': round_c(self.inputs['X'] / scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype(self.dtype), - } - - def set_dtype(self): - self.dtype = np.float32 +def get_compute_type(dtype): + assert dtype in [np.float16, np.float32, np.float64] + if dtype == np.float16: + return np.float32 + return dtype - def test_check_output(self): - self.check_output() - -class TestFakeQuantizeOpFloat16(TestFakeQuantizeOp): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeQuantizeOp1(OpTest): +class TestFakeQuantizeAbsMaxOp(OpTest): def setUp(self): - self.op_type = "fake_quantize_abs_max" + self.op_type = 'fake_quantize_abs_max' self.attrs = {'bit_length': 8} - self.inputs = {'X': np.zeros((10, 10)).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") - inv_scale = 1.0 / (scale + 1e-6) if scale < 1e-30 else 1.0 / scale - self.outputs = { - 'Out': np.round(self.inputs['X'] * inv_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype("float32"), - } - - def test_check_output(self): - self.check_output() - -class TestFakeQuantizeOp2(OpTest): - def 
setUp(self): - self.op_type = "fake_quantize_abs_max" - self.attrs = {'bit_length': 8} - self.inputs = {'X': np.full((10, 10), 1e-40).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") + def _fake_quantize_abs_max(self, dtype, input_shape, distribution): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + scale = np.max(np.abs(input_data)) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 inv_scale = 1.0 / (scale + 1e-6) if scale < 1e-30 else 1.0 / scale - self.outputs = { - 'Out': np.round(self.inputs['X'] * inv_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype("float32"), - } - - def test_check_output(self): + output_data = round_c(input_data.astype(compute_type) * inv_scale * bnt) + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype self.check_output() + def test_fake_quantize_abs_max(self): + self._fake_quantize_abs_max(np.float32, (124, 240), np.random.random) -class TestFakeChannelWiseQuantizeOp(OpTest): - def setUp(self): - self.set_dtype() - self.set_arg() - assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1." + def test_fake_quantize_abs_max_float16(self): + self._fake_quantize_abs_max(np.float16, (124, 240), np.random.random) - self.op_type = "fake_channel_wise_quantize_abs_max" - self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis} + def test_fake_quantize_abs_max_underflow(self): + self._fake_quantize_abs_max(np.float32, (10, 10), np.zeros) - scales = [] - outputs = self.inputs['X'].copy() - bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 - if self.quant_axis == 0: - for i in range(self.inputs['X'].shape[0]): - scale_v = np.max(np.abs(self.inputs['X'][i])).astype(self.dtype) - scales.append(scale_v) - outputs[i] = round_c( - self.dtype(bnt) * (self.dtype(1.0) / scale_v) * outputs[i]) - elif self.quant_axis == 1: - for i in range(self.inputs['X'].shape[1]): - scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( - self.dtype) - scales.append(scale_v) - outputs[:, i] = round_c( - self.dtype(bnt) * (self.dtype(1.0) / scale_v) * - outputs[:, i]) - - self.outputs = { - 'Out': outputs, - 'OutScale': np.array(scales).astype(self.dtype), - } + def test_fake_quantize_abs_max_underflow2(self): + self._fake_quantize_abs_max(np.float32, (10, 10), + lambda shape: np.full(shape, 1e-40)) - def set_arg(self): - self.quant_axis = 0 - self.inputs = { - 'X': np.random.random((20, 15, 6, 6)).astype(self.dtype), - } - def set_dtype(self): - self.dtype = np.float32 +class TestFakeChannelWiseQuantizeAbsMaxOp(OpTest): + def setUp(self): + self.op_type = 'fake_channel_wise_quantize_abs_max' + self.attrs = {'bit_length': 8} - def test_check_output(self): + def _fake_channel_wise_quantize_abs_max(self, dtype, input_shape, + quant_axis, distribution): + assert quant_axis in [0, 1], 'quant_axis should be 0 or 1.' 
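+ # Per-channel path: the scale below is reduced over every axis except quant_axis,
+ # so each slice along quant_axis gets its own scale (the np.random.random inputs
+ # are non-negative, which is why np.amax is equivalent to a max-abs reduction here).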
+ input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + compute_axis = tuple( + i for i in range(len(input_shape)) if i != quant_axis) + scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True) + output_data = round_c(bnt * input_data.astype(compute_type) / + scale_broadcast) + if quant_axis == 1: + scale_broadcast = np.transpose(scale_broadcast, + (1, ) + compute_axis) + scale = scale_broadcast.reshape(input_shape[quant_axis], -1)[:, 0] + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype + self.attrs['quant_axis'] = quant_axis self.check_output() - -class TestFakeChannelWiseQuantizeOpFloat16(TestFakeChannelWiseQuantizeOp): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeChannelWiseQuantizeOp1(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 1 - self.inputs = { - 'X': np.random.random((15, 20, 5, 5)).astype(self.dtype), - } - - -class TestFakeChannelWiseQuantizeOp1Float16(TestFakeChannelWiseQuantizeOp1): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeChannelWiseQuantizeOp2(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 0 - self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } - - -class TestFakeChannelWiseQuantizeOp3(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 1 - self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } + def test_fake_channel_wise_quantize_abs_max(self): + dtype_options = [np.float32, np.float16] + input_shape_quant_axis_options = [[(20, 15, 6, 6), 0], + [(15, 20, 5, 5), 1], [(30, 15), 0], + [(30, 15), 1]] + for dtype, input_shape_quant_axis in itertools.product( + dtype_options, input_shape_quant_axis_options): + input_shape, quant_axis = input_shape_quant_axis + with self.subTest( + dtype=dtype, input_shape=input_shape, + quant_axis=quant_axis): + self._fake_channel_wise_quantize_abs_max( + dtype, input_shape, quant_axis, np.random.random) class TestFakeQuantizeRangeAbsMaxOp(OpTest): def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_range_abs_max" - self.attrs = { - 'bit_length': int(5), - 'window_size': int(1), - 'is_test': False - } - x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype(self.dtype) + self.op_type = 'fake_quantize_range_abs_max' + self.attrs = {'bit_length': 5, 'window_size': 1} + + def _fake_quantize_range_abs_max(self, + dtype, + input_shape, + distribution, + is_test=False): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + in_scale = np.zeros(1).astype(dtype) + out_scale = np.zeros(self.attrs['window_size']).astype(dtype) + out_scale[0] = np.max(np.abs(input_data)) + if is_test: + out_scale[0] = in_scale[0] = out_scale[0] - 1.0 + clip_data = np.clip(input_data, -in_scale, in_scale) + else: + clip_data = input_data + output_data = round_c( + clip_data.astype(compute_type) / out_scale[0] * bnt) self.inputs = { - 'X': x, - 'Iter': np.zeros(1).astype("int64"), - 'InScale': np.zeros(1).astype(self.dtype) + 'X': input_data, + 'Iter': np.zeros(1).astype(np.int64), + 'InScale': in_scale } - scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) - - out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) - out_scales[0] = scale self.outputs = { - 'Out': round_c( - self.dtype((1 << 
(self.attrs['bit_length'] - 1)) - 1) * - (self.dtype(1.0) / scale) * self.inputs['X']), - 'OutScale': scale, - 'OutScales': out_scales, + 'Out': output_data, + 'OutScale': out_scale[0], + 'OutScales': out_scale } - - def set_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): + self.dtype = dtype + self.attrs['is_test'] = is_test self.check_output() - -class TestFakeQuantizeRangeAbsMaxOpFloat16(TestFakeQuantizeRangeAbsMaxOp): - def set_dtype(self): - self.dtype = np.float16 + def test_fake_quantize_range_abs_max(self): + dtype_options = [np.float32, np.float16] + is_test_options = [False, True] + for dtype, is_test in itertools.product(dtype_options, is_test_options): + self.attrs['bit_length'] = 8 if is_test else 5 + with self.subTest(dtype=dtype, is_test=is_test): + self._fake_quantize_range_abs_max( + dtype, (8, 16, 7, 7), + lambda shape: (np.random.random(shape) - 0.5) * 10, + is_test=is_test) class TestMovingAverageAbsMaxScaleOp(OpTest): def setUp(self): - self.op_type = "moving_average_abs_max_scale" + self.op_type = 'moving_average_abs_max_scale' self.attrs = {'moving_rate': float(0.9), 'is_test': False} - accum = np.zeros(1).astype("float32") - accum[0] = 1 - state = np.zeros(1).astype("float32") - state[0] = 1 - x = np.random.random((8, 16, 7, 7)).astype("float32") - self.inputs = { - 'X': x, - 'InAccum': accum, - 'InState': state, - } - out = x - out_accum = np.zeros(1).astype("float32") - out_state = np.zeros(1).astype("float32") - out_scale = np.zeros(1).astype("float32") - out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max( - np.abs(self.inputs['X'])).astype("float32") - out_state[0] = self.attrs['moving_rate'] * state[0] + 1 + def _moving_average_abs_max_scale(self, dtype, input_shape, distribution): + input_data = distribution(input_shape).astype(dtype) + in_accum = np.ones(1).astype(dtype) + in_state = np.ones(1).astype(dtype) + out_accum = self.attrs['moving_rate'] * in_accum[0] + np.max( + np.abs(input_data)) + out_state = self.attrs['moving_rate'] * in_state[0] + 1.0 out_scale = out_accum / out_state + self.inputs = { + 'X': input_data, + 'InAccum': in_accum, + 'InState': in_state + } self.outputs = { - 'Out': out, + 'Out': input_data, 'OutAccum': out_accum, 'OutState': out_state, - 'OutScale': out_scale, + 'OutScale': out_scale } - - def test_check_output(self): + self.dtype = dtype self.check_output() + def test_moving_average_abs_max(self): + self._moving_average_abs_max_scale(np.float32, (8, 16, 7, 7), + np.random.random) -class TestFakeQuantizeRangeAbsMaxOp2(OpTest): - def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_range_abs_max" - self.attrs = { - 'bit_length': int(8), - 'window_size': int(1), - 'is_test': True - } - x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype(self.dtype) - scale = np.array([np.max(np.abs(x)).astype(self.dtype) - 1.0]) - out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) - out_scales[0] = scale.astype(self.dtype) - self.inputs = { - 'X': x, - 'Iter': np.zeros(1).astype("int64"), - 'InScale': scale.astype(self.dtype) - } - xs = np.clip(x, -scale, scale).astype(self.dtype) - qs = round_c( - self.dtype( - self.dtype((1 << (self.attrs['bit_length'] - 1)) - 1) * ( - self.dtype(1.0) / scale) * xs)) - self.outputs = { - 'Out': qs, - 'OutScale': scale.astype(self.dtype), - 'OutScales': out_scales, - } - - def set_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - self.check_output(no_check_set=set(['OutScale', 'OutScales'])) - - -class 
TestFakeQuantizeRangeAbsMaxOp2Float16(TestFakeQuantizeRangeAbsMaxOp2): - def set_dtype(self): - self.dtype = np.float16 - -class TestMovingOpBase(OpTest): +class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest): def setUp(self): - self.set_dtype() - self.init_type() - self.attrs = { - 'bit_length': int(5), - 'moving_rate': float(0.9), - 'is_test': False - } - accum = np.zeros(1).astype(self.dtype) - accum[0] = 1 - state = np.zeros(1).astype(self.dtype) - state[0] = self.dtype(1.0) - scale = np.zeros(1).astype(self.dtype) - scale[0] = 0.001 + self.op_type = 'fake_quantize_moving_average_abs_max' + self.attrs = {'bit_length': 5, 'moving_rate': 0.9, 'is_test': False} + + def _fake_quantize_moving_average_abs_max(self, + dtype, + input_shape, + distribution, + dequantize=False, + with_gradient=False): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + in_accum = np.ones(1).astype(dtype) + in_state = np.ones(1).astype(dtype) + in_scale = np.array([0.001]).astype(dtype) + out_accum = np.zeros(1).astype(dtype) + out_state = np.zeros(1).astype(dtype) + out_scale = np.zeros(1).astype(dtype) + out_accum[0] = self.attrs['moving_rate'] * in_accum[0] + np.max( + np.abs(input_data)) + out_state[0] = self.attrs['moving_rate'] * in_state[0] + 1.0 + out_scale = out_accum / out_state + round_data = round_c(input_data.astype(compute_type) / out_scale * bnt) + if dequantize: + output_data = (round_data * out_scale / bnt).astype(dtype) + self.op_type = 'fake_quantize_dequantize_moving_average_abs_max' + else: + output_data = round_data.astype(dtype) self.inputs = { - 'X': np.random.random((8, 16, 7, 7)).astype(self.dtype), - 'InScale': scale, - 'InAccum': accum, - 'InState': state, + 'X': input_data, + 'InScale': in_scale, + 'InAccum': in_accum, + 'InState': in_state } - - out_accum = np.zeros(1).astype(self.dtype) - out_state = np.zeros(1).astype(self.dtype) - out_scale = np.zeros(1).astype(self.dtype) - out_accum[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(accum[ - 0]) + np.max(np.abs(self.inputs['X'])).astype(self.dtype) - out_state[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(state[ - 0]) + self.dtype(1.0) - out_scale = self.dtype(self.dtype(out_accum) / self.dtype(out_state)) - out_data = self.calc_output(out_scale) self.outputs = { - 'Out': out_data, + 'Out': output_data, 'OutAccum': out_accum, 'OutState': out_state, - 'OutScale': out_scale, + 'OutScale': out_scale } - - def set_dtype(self): - self.dtype = np.float32 - - def init_type(self): - self.op_type = "fake_quantize_moving_average_abs_max" - - def calc_output(self, out_scale): - return round_c(self.inputs['X'] / out_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)) - - def test_check_output(self): + self.dtype = dtype self.check_output() + if with_gradient: + gradient = [ + np.ones(input_data.shape) / np.product(input_data.shape) + ] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) + def test_fake_quantize_moving_average_abs_max(self): + self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7), + np.random.random) -class TestMovingOpBaseFloat16(TestMovingOpBase): - def set_dtype(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output(atol=1e-2) + def test_fake_quantize_moving_average_abs_max_float16(self): + self._fake_quantize_moving_average_abs_max(np.float16, (8, 16, 7, 7), + np.random.random) + def test_fake_quantize_dequantize_moving_average_abs_max(self): + 
self._fake_quantize_moving_average_abs_max( + np.float32, (8, 16, 7, 7), + np.random.random, + dequantize=True, + with_gradient=True) -class TestFakeQuantDequantMovingOp(TestMovingOpBase): - def init_type(self): - self.op_type = "fake_quantize_dequantize_moving_average_abs_max" - def calc_output(self, out_scale): - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - return np.round(self.inputs['X'] / out_scale * - range_v) * out_scale / range_v - - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) - - -class TestFakeQuantDequantAbsOp(OpTest): +class TestFakeQuantizeDequantizeAbsMaxOp(OpTest): def setUp(self): - self.op_type = "fake_quantize_dequantize_abs_max" + self.op_type = 'fake_quantize_dequantize_abs_max' self.attrs = {'bit_length': 8} - self.inputs = {'X': np.random.random((124, 240)).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") - out_data = self.calc_output(scale) + + def _fake_quantize_dequantize_abs_max(self, dtype, input_shape, + distribution): + input_data = distribution(input_shape).astype(dtype) + scale = np.max(np.abs(input_data)).astype(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + output_data = round_c(input_data / scale * bnt) * scale / bnt + self.inputs = {'X': input_data} self.outputs = { - 'Out': out_data, - 'OutScale': np.array(scale).astype("float32"), + 'Out': output_data, + 'OutScale': np.array(scale).astype(dtype) } - - def calc_output(self, scale): - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - return np.round(self.inputs['X'] / scale * range_v) * scale / range_v - - def test_check_output(self): + self.dtype = dtype self.check_output() + gradient = [np.ones(input_data.shape) / np.product(input_data.shape)] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) + def test_fake_quantize_dequantize_abs_max(self): + self._fake_quantize_dequantize_abs_max(np.float32, (124, 240), + np.random.random) -class TestChannelWiseFakeQuantDequantOp(OpTest): +class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest): def setUp(self): - self.set_arg() - assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1." 
- - self.op_type = "fake_channel_wise_quantize_dequantize_abs_max" - self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis} - - scales = [] - outputs = self.inputs['X'].copy() - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - if self.quant_axis == 0: - for i in range(self.inputs['X'].shape[0]): - scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32") - scales.append(scale_v) - outputs[i] = np.round(outputs[i] * range_v / - scale_v) * scale_v / range_v - elif self.quant_axis == 1: - for i in range(self.inputs['X'].shape[1]): - scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( - "float32") - scales.append(scale_v) - outputs[:, i] = np.round(outputs[:, i] * range_v / - scale_v) * scale_v / range_v - - self.outputs = { - 'Out': outputs, - 'OutScale': np.array(scales).astype("float32"), - } - - def set_arg(self): - self.quant_axis = 0 - self.inputs = { - 'X': np.random.random((3, 4, 64, 64)).astype("float32"), - } + self.op_type = 'fake_channel_wise_quantize_dequantize_abs_max' + self.attrs = {'bit_length': 8} - def test_check_output(self): + def _fake_channel_wise_quantize_dequantize_abs_max( + self, dtype, input_shape, quant_axis, distribution): + assert quant_axis in [0, 1], 'quant_axis should be 0 or 1.' + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + output_data = input_data.copy().astype(compute_type) + compute_axis = tuple( + i for i in range(len(input_shape)) if i != quant_axis) + scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True) + output_data = round_c(bnt * output_data / + scale_broadcast) * scale_broadcast / bnt + if quant_axis == 1: + scale_broadcast = np.transpose(scale_broadcast, + (1, ) + compute_axis) + scale = scale_broadcast.reshape(input_shape[quant_axis], -1)[:, 0] + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype + self.attrs['quant_axis'] = quant_axis self.check_output() + gradient = [np.ones(input_data.shape) / np.product(input_data.shape)] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) - - -class TestChannelWiseFakeQuantDequantOp1(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 1 - self.inputs = { - 'X': np.random.random((15, 20, 5, 5)).astype("float32"), - } - - -class TestChannelWiseFakeQuantDequantOp2(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 0 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } - - -class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 1 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } + def test_channel_wise_fake_quant_dequant_abs_max(self): + input_shape_quant_axis_options = [[(3, 4, 64, 64), 0], [( + 15, 20, 5, 5), 1], [(30, 15), 0], [(30, 15), 1]] + for input_shape, quant_axis in input_shape_quant_axis_options: + with self.subTest(input_shape=input_shape, quant_axis=quant_axis): + self._fake_channel_wise_quantize_dequantize_abs_max( + np.float32, input_shape, quant_axis, np.random.random) def quantize_max_abs(x, max_range): @@ -589,5 +462,5 @@ def test_check_output(self): self.check_output() -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 0ae005430e03b..28e03fdfd70e1 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index 67160f59952ef..445620f9e1cb1 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -36,6 +36,18 @@ class TestFusedAttentionOp(OpTest): def setUp(self): self.config() self.generate_input_data() + + self.rtol = 1e-5 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + paddle.set_default_dtype(self.x_type) self.__class__.op_type = "fused_attention" # use autograd to check grad in this unittest. @@ -274,9 +286,9 @@ def test_fused_attention_op(self): final_out_ref, x_grad_ref = self.GetBaselineOut() final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4) + final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol) np.testing.assert_allclose( - x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4) + x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol) class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp): @@ -307,9 +319,9 @@ def test_fused_attention_op(self): final_out_ref, x_grad_ref = self.GetBaselineOut() final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-1) + final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol) np.testing.assert_allclose( - x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-1) + x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol) class TestFusedAttentionOpCacheKV(TestFusedAttentionOp): @@ -325,7 +337,10 @@ def test_fused_attention_op(self): final_out_ref = self.GetBaselineOut() final_out, cache_kv_out = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4) + final_out_ref, + final_out.numpy(), + rtol=self.rtol, + atol=self.atol) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py index bdaf32ee0726d..74dc9351a25b4 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py @@ -173,6 +173,17 @@ def setUp(self): self.config() self.generate_input_data() + self.rtol = 1e-5 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is 
solved. + self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + def setAttnMask(self): self.has_attn_mask = True @@ -256,7 +267,8 @@ def run_imperative(self): fused_attn.ln_scale.numpy(), fused_attn_ln_bias, fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias, fused_attn.linear_weight.numpy(), fused_attn_linear_bias) - np.testing.assert_allclose(ref_out, out.numpy(), rtol=1e-5, atol=1e-4) + np.testing.assert_allclose( + ref_out, out.numpy(), rtol=self.rtol, atol=self.atol) def run_static(self): fused_attn = FusedMultiHeadAttention( @@ -341,7 +353,7 @@ def test_static_api(self): self.attn_mask, ln_scale, ln_bias, ln_2_scale, ln_2_bias, qkv_weight, qkv_bias, linear_weight, linear_bias) - np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=1e-4) + np.testing.assert_allclose(ref_out, out, rtol=self.rtol, atol=self.atol) def test_dynamic_api(self): paddle.disable_static(place=paddle.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py new file mode 100644 index 0000000000000..d47450837a455 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +from paddle.nn.layer.transformer import _convert_attention_mask +from paddle import tensor +from paddle.fluid import layers +import unittest +from op_test import OpTest +from paddle.fluid.framework import default_main_program + +default_main_program().random_seed = 42 + + +class TestFusedBiasDropoutResidualLayerNormOp(OpTest): + def setUp(self): + self.config() + self.generate_input_data() + paddle.set_default_dtype(self.x_type) + self.__class__.op_type = "fused_bias_dropout_residual_layer_norm" + # use autograd to check grad in this unittest. 
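+ # (OpTest's numeric gradient check is skipped; test_fused_op instead compares the
+ # fused op's autograd gradients against the unfused baseline built in GetBaselineOut.)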
+ self.__class__.no_need_check_grad = True + paddle.set_default_dtype(np.float32) + self.norm1 = LayerNorm(self.embed_dim) + paddle.set_default_dtype(self.x_type) + self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train") + + def config(self): + self.x_type = np.float32 + self.atol = 1e-4 + self.training = True + self.batch_size = 8 + self.query_length = 128 + self.embed_dim = 1024 + self.dropout_prob = 0.0 + self.weight_attr = None + self.bias_attr = None + + def generate_input_data(self): + self.x = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.residual = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.linear_bias = np.random.rand(self.embed_dim).astype(self.x_type) + self.dout = np.random.random((self.batch_size, self.query_length, + self.embed_dim)).astype(self.x_type) + + if self.bias_attr is False: + self.tensor_linear_bias = None + else: + self.tensor_linear_bias = paddle.to_tensor( + self.linear_bias, stop_gradient=False) + + self.tensor_x = paddle.to_tensor(self.x, stop_gradient=False) + self.tensor_residual = paddle.to_tensor( + self.residual, stop_gradient=False) + + def GetBaselineOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + if self.tensor_linear_bias is not None: + out = self.tensor_x + self.tensor_linear_bias + else: + out = self.tensor_x + + residual_out = self.tensor_residual + self.dropout(out) + final_out = self.norm1(residual_out) + + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True) + + if self.tensor_linear_bias is not None: + tensor_linear_bias_grad = self.tensor_linear_bias.grad + else: + tensor_linear_bias_grad = None + return final_out, self.tensor_x.grad, self.tensor_residual.grad, tensor_linear_bias_grad + + def GetFusedBiasDropoutResidualLayerNormOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + ln_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False) + ln_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False) + epsilon = 1e-05 + + final_out = incubate_f.fused_bias_dropout_residual_layer_norm( + self.tensor_x, self.tensor_residual, self.tensor_linear_bias, + ln_scale, ln_bias, self.dropout_prob, epsilon) + + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True) + if self.tensor_linear_bias is not None: + tensor_linear_bias_grad = self.tensor_linear_bias.grad + else: + tensor_linear_bias_grad = None + return final_out, self.tensor_x.grad, self.tensor_residual.grad, tensor_linear_bias_grad + + def test_fused_op(self): + out_ref, x_grad_ref, residual_grad_ref, linear_bias_grad_ref = self.GetBaselineOut( + ) + out, x_grad, residual_grad, linear_bias_grad = self.GetFusedBiasDropoutResidualLayerNormOut( + ) + np.testing.assert_allclose( + out_ref, out.numpy(), rtol=1e-5, atol=self.atol) + np.testing.assert_allclose( + x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=self.atol) + np.testing.assert_allclose( + residual_grad_ref, residual_grad.numpy(), rtol=1e-5, atol=self.atol) + if linear_bias_grad_ref is not None: + np.testing.assert_allclose( + linear_bias_grad_ref, + linear_bias_grad.numpy(), + rtol=1e-5, + atol=self.atol) + + +class TestFusedBiasDropoutResidualLayerNormOpBiasIsNone( + TestFusedBiasDropoutResidualLayerNormOp): + def config(self): + super().config() + self.bias_attr = False + + +class TestFusedBiasDropoutResidualLayerNormOpFp16( + TestFusedBiasDropoutResidualLayerNormOp): + def config(self): + super().config() + 
self.x_type = np.float16 + self.atol = 1e-1 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py new file mode 100644 index 0000000000000..19fc3972e58d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +from paddle.incubate.nn.layer.fused_transformer import FusedBiasDropoutResidualLayerNorm +from paddle import tensor +from paddle.fluid import layers +from paddle.static import Program, program_guard +import unittest + + +def layer_norm(x, has_scale, has_bias, weight, bias, epsilon=1e-05): + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = x1 + if (has_scale): + x_scaled = weight * x1 + x_scaled_bias = x_scaled + if (has_bias): + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + + +def compute_reference(x, residual, ln_scale, ln_bias, linear_bias): + batch_size = x.shape[0] + seq_len = x.shape[1] + embed_dim = x.shape[2] + + has_bias = True + if ln_bias is None: + has_bias = False + # bias add, dropout, residual add, layer_norm. 
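+ # dropout_prob is 0.0 in these tests, so the dropout stage reduces to an identity
+ # mapping in this reference implementation.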
+ if linear_bias is not None: + linear_bias_out = x + linear_bias + else: + linear_bias_out = x + linear_bias_dropout_out = linear_bias_out + linear_bias_dropout_residual_out = residual + linear_bias_dropout_out + linear_bias_dropout_residual_ln_out = layer_norm( + linear_bias_dropout_residual_out, True, has_bias, ln_scale, ln_bias) + return linear_bias_dropout_residual_ln_out + + +class TestFusedBiasDropoutResidualLayerNormAPI(unittest.TestCase): + def setUp(self): + self.setXType() + self.setBiasAttr() + self.config() + self.generate_input_data() + + def setBiasAttr(self): + self.bias_attr = None + + def setXType(self): + self.x_type = np.float32 + self.atol = 1e-4 + + def config(self): + self.training = True + self.batch_size = 1 + self.query_length = 2 + self.embed_dim = 4 + self.dropout_prob = 0.0 + self.weight_attr = None + + def generate_input_data(self): + self.x = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.residual = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + + def run_imperative(self): + fused_bias_dropout_residual_ln = FusedBiasDropoutResidualLayerNorm( + self.embed_dim, self.dropout_prob, self.weight_attr, self.bias_attr) + + linear_bias = None + if self.bias_attr is not False: + linear_bias = np.random.random(fused_bias_dropout_residual_ln. + linear_bias.shape).astype('float32') + fused_bias_dropout_residual_ln.linear_bias.set_value( + paddle.to_tensor(linear_bias)) + out = fused_bias_dropout_residual_ln( + paddle.to_tensor(self.x), paddle.to_tensor(self.residual)) + + ln_bias = None + if self.bias_attr is not False: + ln_bias = fused_bias_dropout_residual_ln.ln_bias.numpy() + ln_scale = fused_bias_dropout_residual_ln.ln_scale.numpy(), + ref_out = compute_reference(self.x, self.residual, ln_scale, ln_bias, + linear_bias) + np.testing.assert_allclose( + ref_out, out.numpy(), rtol=1e-5, atol=self.atol) + + def run_static(self): + fused_op = FusedBiasDropoutResidualLayerNorm( + self.embed_dim, self.dropout_prob, self.weight_attr, self.bias_attr) + + x = paddle.static.data( + name='X', + shape=[self.batch_size, self.query_length, self.embed_dim], + dtype=self.x_type) + residual = paddle.static.data( + name='Residual', + shape=[self.batch_size, self.query_length, self.embed_dim], + dtype=self.x_type) + final_out = fused_op(x, residual) + + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + linear_bias = None + ln_bias = None + if self.bias_attr is False: + out, ln_scale = exe.run( + paddle.static.default_main_program(), + feed={"X": self.x, + "Residual": self.residual}, + fetch_list=[final_out, fused_op.ln_scale]) + else: + out, linear_bias, ln_scale, ln_bias = exe.run( + paddle.static.default_main_program(), + feed={"X": self.x, + "Residual": self.residual}, + fetch_list=[ + final_out, fused_op.linear_bias, fused_op.ln_scale, + fused_op.ln_bias + ]) + return out, linear_bias, ln_scale, ln_bias + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(Program()): + out, linear_bias, ln_scale, ln_bias = self.run_static() + ref_out = compute_reference(self.x, self.residual, ln_scale, ln_bias, + linear_bias) + np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=self.atol) + + def test_dynamic_api(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + + +class TestFusedBiasDropoutResidualLayerNormAPIBiasIsNone( + 
TestFusedBiasDropoutResidualLayerNormAPI): + def setBiasAttr(self): + self.bias_attr = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index 8c68eb243aea8..25336efd6a7fb 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -40,7 +40,12 @@ def getShape(self): def getDiff(self): self.rtol = 1e-3 - self.atol = 1e-4 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 def getActivation(self): self.act_method = "gelu" diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py new file mode 100644 index 0000000000000..6f9ba5f5e4e57 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -0,0 +1,252 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +from paddle import tensor +import unittest +from op_test import OpTest, convert_float_to_uint16 +from test_sparse_attention_op import get_cuda_version +from paddle import _C_ops +from paddle.fluid.framework import default_main_program +from paddle.fluid import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Paddle is not compiled with CUDA") +class TestFusedGateAttentionOp(OpTest): + def setUp(self): + self.__class__.op_type = "fused_gate_attention" + # use autograd to check grad in this unittest. 
+ self.__class__.no_need_check_grad = True + self.config() + self.merge_qkv = self.q_dim == self.kv_dim + self.generate_input_data() + + def config(self): + self.dtype = "float32" + self.has_gating = True + self.batch_size = 1 + self.msa_len = 3 + self.res_len = 5 + self.q_dim = 6 + self.num_heads = 2 + self.key_dim = 4 + self.m_size = self.res_len + self.kv_dim = self.q_dim + self.out_dim = self.q_dim + self.bias_attr = True + + def generate_input_data(self): + def _random(shape): + if self.dtype == "bfloat16": + data = np.random.random(shape).astype("float32") + return convert_float_to_uint16(data) + else: + return np.random.random(shape).astype(self.dtype) + + np.random.seed(123) + self.query = _random( + (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + self.q_weight = _random((self.q_dim, self.num_heads, self.key_dim)) + self.k_weight = _random((self.kv_dim, self.num_heads, self.key_dim)) + self.v_weight = _random((self.kv_dim, self.num_heads, self.key_dim)) + if self.merge_qkv: + self.key = None + # (3, self.num_heads, self.key_dim, self.q_dim) + q_weight_t = np.transpose(self.q_weight, axes=[1, 2, 0]) + k_weight_t = np.transpose(self.k_weight, axes=[1, 2, 0]) + v_weight_t = np.transpose(self.v_weight, axes=[1, 2, 0]) + self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t]) + else: + self.key = _random( + (self.batch_size, self.msa_len, self.m_size, self.kv_dim)) + self.qkv_weight = None + + self.attn_mask = _random( + (self.batch_size, self.msa_len, 1, 1, self.m_size)) + + if self.bias_attr: + self.nonbatched_bias = _random( + (self.batch_size, 1, self.num_heads, self.res_len, self.m_size)) + + if self.has_gating: + self.gating_w = _random((self.q_dim, self.num_heads, self.key_dim)) + self.gating_b = _random((self.num_heads, self.key_dim)) + + self.output_w = _random((self.num_heads, self.key_dim, self.out_dim)) + self.output_b = _random((self.out_dim)) + + self.dout = _random( + (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + + def get_reference_out(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + query = paddle.to_tensor(self.query, stop_gradient=False) + key = query if self.merge_qkv else paddle.to_tensor( + self.key, stop_gradient=False) + q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) + k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) + v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) + src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + + c = self.key_dim**(-0.5) + # [batch_size, msa_len, num_heads, res_len, key_dim] + q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c + # [batch_size, msa_len, num_heads, m_size, key_dim] + k = paddle.einsum('nbka,ahc->nbkhc', key, k_weight) + # [batch_size, msa_len, num_heads, m_size, key_dim] + v = paddle.einsum('nbka,ahc->nbkhc', key, v_weight) + + # [batch_size, msa_len, num_heads, res_len, m_size] + logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k) # qk_out + logits = logits + src_mask + if self.bias_attr: + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False) + logits = logits + nonbatched_bias + + weights = nn.functional.softmax(logits) # softmax_out + weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) + + if self.has_gating: + gating_w = paddle.to_tensor(self.gating_w, stop_gradient=False) + gating_b = paddle.to_tensor(self.gating_b, stop_gradient=False) + gate_values = paddle.einsum('nbqc,chv->nbqhv', query, + gating_w) + gating_b + gate_values = nn.functional.sigmoid(gate_values) 
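+            # The sigmoid gate has shape
+            # [batch_size, msa_len, res_len, num_heads, key_dim], matching
+            # weighted_avg, and is applied elementwise before the final
+            # output projection.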
+ weighted_avg = weighted_avg * gate_values + + output_b = paddle.to_tensor(self.output_b, stop_gradient=False) + output_w = paddle.to_tensor(self.output_w, stop_gradient=False) + + out = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, + output_w) + output_b + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True) + if self.merge_qkv: + return out, query.grad, None + else: + return out, query.grad, key.grad + + def get_fused_gate_attention_out(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + query = paddle.to_tensor(self.query, stop_gradient=False) + if self.merge_qkv: + key = None + q_weight = None + k_weight = None + v_weight = None + qkv_weight = paddle.to_tensor(self.qkv_weight, stop_gradient=False) + else: + key = paddle.to_tensor(self.key, stop_gradient=False) + q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) + k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) + v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) + qkv_weight = None + + src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + + if self.bias_attr: + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False) + else: + nonbatched_bias = None + if self.has_gating: + gating_w = paddle.to_tensor(self.gating_w, stop_gradient=False) + gating_b = paddle.to_tensor(self.gating_b, stop_gradient=False) + else: + gating_w = None + gating_b = None + + output_w = paddle.to_tensor(self.output_w, stop_gradient=False) + output_b = paddle.to_tensor(self.output_b, stop_gradient=False) + + _, _, _, _, _, _, _, out = _C_ops.fused_gate_attention( + query, key, q_weight, k_weight, v_weight, qkv_weight, + nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b, + 'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) + + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True) + if key is not None: + return out, query.grad, key.grad + else: + return out, query.grad, None + + def check_output_and_grad(self, atol, rtol): + out_ref, query_grad_ref, key_grad_ref = self.get_reference_out() + out, query_grad, key_grad = self.get_fused_gate_attention_out() + np.testing.assert_allclose(out_ref, out.numpy(), atol=atol, rtol=rtol) + np.testing.assert_allclose( + query_grad_ref, query_grad.numpy(), atol=atol, rtol=rtol) + if key_grad_ref is not None and key_grad is not None: + np.testing.assert_allclose( + key_grad_ref, key_grad.numpy(), atol=atol, rtol=rtol) + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-5, rtol=1e-5) + + +class TestSeparatedQKVCase(TestFusedGateAttentionOp): + def config(self): + self.dtype = "float32" + self.has_gating = False + self.batch_size = 1 + self.msa_len = 3 + self.res_len = 5 + self.q_dim = 6 + self.num_heads = 2 + self.key_dim = 4 + self.m_size = 4 + self.kv_dim = 2 + self.out_dim = self.q_dim + self.bias_attr = False + + +class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): + def config(self): + super().config() + self.has_gating = False + self.bias_attr = False + + +class TestMergeQKVFp16Case(TestFusedGateAttentionOp): + def config(self): + super().config() + self.dtype = "float16" + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-1, rtol=1e-5) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11000, + "core is not compiled with CUDA or the CUDA version is lower than 11.0" +) +class TestMergeQKVBF16Case(TestFusedGateAttentionOp): + def config(self): +
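+        # Same merged-QKV setup as the base test; only the compute dtype is
+        # switched to bfloat16, which is why the CUDA version guard above is
+        # required.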
super().config() + self.dtype = "bfloat16" + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-1, rtol=1e-3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py index 7dc86d0dea382..843b495e85b9a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py +++ b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py @@ -49,6 +49,14 @@ def setUp(self): self.setPreLayerNorm() self.setAttnMask() + self.rtol = 1e-3 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + def fused_weight(self, weight, num_head): a = paddle.transpose(weight, perm=[1, 0]) return paddle.reshape( @@ -151,13 +159,13 @@ def test_out(self): self.assertTrue(fused_encoder.fused_attn.extra_repr(), correct_attn_str) np.testing.assert_allclose( - fused_out.numpy(), base_out.numpy(), rtol=1e-3, atol=1e-4) + fused_out.numpy(), base_out.numpy(), rtol=self.rtol, atol=self.atol) self.assertTrue( np.allclose( fused_out.grad.numpy(), base_out.grad.numpy(), - rtol=1e-3, - atol=1e-4)) + rtol=self.rtol, + atol=self.atol)) class TestFusedTransformerEncoderLayerAct(TestFusedTransformerEncoderLayer): diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index cad6437d1d3e3..21844c9e402ad 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -48,6 +48,7 @@ def test_main(self): class TestLookupTableOp(OpTest): def setUp(self): self.op_type = "lookup_table_v2" + self.python_api = paddle.nn.functional.embedding table = np.random.random((17, 31)).astype("float64") ids = np.random.randint(0, 17, 4).astype(self.id_dtype()) self.inputs = {'W': table, 'Ids': ids} @@ -57,10 +58,10 @@ def id_dtype(self): return "int64" def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_eager=True) class TestLookupTableOpInt16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_nanmedian.py b/python/paddle/fluid/tests/unittests/test_nanmedian.py new file mode 100644 index 0000000000000..2e1f13a8c7d9f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nanmedian.py @@ -0,0 +1,196 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
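# A minimal NumPy sketch of the nanmedian semantics the test below relies on
# (paddle.nanmedian is compared against np.nanmedian): the median is taken
# over the non-NaN entries only, and an all-NaN slice yields NaN. The sample
# values here are illustrative, not taken from the test data.
import numpy as np

sample = np.array([[1.0, np.nan, 3.0],
                   [np.nan, np.nan, np.nan]])
# Median over the last axis, ignoring NaNs: [2.0, nan]
# (NumPy emits a RuntimeWarning for the all-NaN row.)
print(np.nanmedian(sample, axis=-1))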
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core + +np.random.seed(102) + + +class TestNanmedian(unittest.TestCase): + def setUp(self): + single_axis_shape = (120) + multi_axis_shape = (2, 3, 4, 5) + + self.fake_data = { + "single_axis_normal": + np.random.uniform(-1, 1, single_axis_shape).astype(np.float32), + "multi_axis_normal": + np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32), + "single_axis_all_nan": np.full(single_axis_shape, np.nan), + "multi_axis_all_nan": np.full(multi_axis_shape, np.nan), + } + + single_partial_nan = self.fake_data["single_axis_normal"].copy() + single_partial_nan[single_partial_nan > 0] = np.nan + multi_partial_nan = self.fake_data["multi_axis_normal"].copy() + multi_partial_nan[multi_partial_nan > 0] = np.nan + self.fake_data["single_axis_partial_nan"] = single_partial_nan + self.fake_data["multi_axis_partial_nan"] = multi_partial_nan + + row_data = np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32) + row_data[:, :, :, 0] = np.nan + row_data[:, :, :2, 1] = np.nan + row_data[:, :, 2:, 2] = np.nan + self.fake_data["row_nan_even"] = row_data + self.fake_data["row_nan_float64"] = row_data.astype(np.float64) + self.fake_data["row_nan_int64"] = row_data.astype(np.int64) + self.fake_data["row_nan_int32"] = row_data.astype(np.int32) + + col_data = np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32) + col_data[:, :, 0, :] = np.nan + col_data[:, :, 1, :3] = np.nan + col_data[:, :, 2, 3:] = np.nan + self.fake_data["col_nan_odd"] = col_data + + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + self.axis_candiate_list = [ + None, 0, 2, -1, -2, (1, 2), [0, -1], [0, 1, 3], (1, 2, 3), + [0, 2, 1, 3] + ] + + def test_api_static(self): + data = self.fake_data["col_nan_odd"] + paddle.enable_static() + np_res = np.nanmedian(data, keepdims=True) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', data.shape) + out1 = paddle.nanmedian(x, keepdim=True) + out2 = paddle.tensor.nanmedian(x, keepdim=True) + out3 = paddle.tensor.stat.nanmedian(x, keepdim=True) + axis = np.arange(len(data.shape)).tolist() + out4 = paddle.nanmedian(x, axis=axis, keepdim=True) + out5 = paddle.nanmedian(x, axis=tuple(axis), keepdim=True) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': data}, + fetch_list=[out1, out2, out3, out4, out5]) + + for out in res: + self.assertTrue(np.allclose(np_res, out, equal_nan=True)) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def clean_axis_numpy(axis, shape_len): + if isinstance(axis, tuple): + axis = list(axis) + if isinstance(axis, list): + for k in range(len(axis)): + if axis[k] < 0: + axis[k] += shape_len + axis = set(axis) + return axis + + def test_data_case(data): + for keep_dim in [False, True]: + if np.isnan(data).all() and keep_dim: + np_ver = np.version.version.split('.') + if int(np_ver[0]) < 1 or int(np_ver[1]) <= 20: + print( + "This numpy version does not support all nan elements when keepdim is True" + ) + continue + + np_res = np.nanmedian(data, keepdims=keep_dim) + pd_res = paddle.nanmedian( + paddle.to_tensor(data), keepdim=keep_dim) + self.assertTrue( + np.allclose( + np_res, pd_res.numpy(), equal_nan=True)) + + def test_axis_case(data, axis): + pd_res = paddle.nanmedian( + paddle.to_tensor(data), axis=axis, keepdim=False) + axis = clean_axis_numpy(axis, len(data.shape)) + np_res = np.nanmedian(data, axis=axis, 
keepdims=False) + self.assertTrue(np.allclose(np_res, pd_res.numpy(), equal_nan=True)) + + for name, data in self.fake_data.items(): + test_data_case(data) + + for axis in self.axis_candiate_list: + test_axis_case(self.fake_data["row_nan_even"], axis) + test_axis_case(self.fake_data["col_nan_odd"], axis) + + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data("X", [10, 12]) + + def test_dtype(): + x2 = paddle.fluid.data('X2', [10, 12], 'bool') + paddle.nanmedian(x2) + + def test_empty_axis(): + paddle.nanmedian(x, axis=[], keepdim=True) + + def test_axis_not_in_range(): + paddle.nanmedian(x, axis=3, keepdim=True) + + def test_duplicated_axis(): + paddle.nanmedian(x, axis=[1, -1], keepdim=True) + + self.assertRaises(TypeError, test_dtype) + self.assertRaises(ValueError, test_empty_axis) + self.assertRaises(ValueError, test_axis_not_in_range) + self.assertRaises(ValueError, test_duplicated_axis) + + def test_dygraph(self): + paddle.disable_static(place=self.place) + with paddle.fluid.dygraph.guard(): + data = self.fake_data["col_nan_odd"] + out = paddle.nanmedian(paddle.to_tensor(data), keepdim=True) + np_res = np.nanmedian(data, keepdims=True) + self.assertTrue(np.allclose(np_res, out, equal_nan=True)) + paddle.enable_static() + + def test_check_grad(self): + paddle.disable_static(place=self.place) + shape = (4, 5) + x_np = np.random.uniform(-1, 1, shape).astype(np.float64) + x_np[0, :] = np.nan + x_np[1, :3] = np.nan + x_np[2, 3:] = np.nan + x_np_sorted = np.sort(x_np) + nan_counts = np.count_nonzero(np.isnan(x_np).astype(np.int32), axis=1) + np_grad = np.zeros((shape)) + for i in range(shape[0]): + valid_cnts = shape[1] - nan_counts[i] + if valid_cnts == 0: + continue + + mid = int(valid_cnts / 2) + targets = [x_np_sorted[i, mid]] + is_odd = valid_cnts % 2 + if not is_odd and mid > 0: + targets.append(x_np_sorted[i, mid - 1]) + for j in range(shape[1]): + if x_np[i, j] in targets: + np_grad[i, j] = 1 if is_odd else 0.5 + + x_tensor = paddle.to_tensor(x_np, stop_gradient=False) + y = paddle.nanmedian(x_tensor, axis=1, keepdim=True) + dx = paddle.grad(y, x_tensor)[0].numpy() + self.assertTrue(np.allclose(np_grad, dx, equal_nan=True)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 3a100cd321e03..4685b00b394b7 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -215,6 +215,10 @@ def test_grad(self): class TestSqueezeDoubleGradCheck(unittest.TestCase): + def squeeze_warpper(self, x): + axes = [0, 2] + return paddle.squeeze(x[0], axes) + @prog_scope() def func(self, place): x_shape = [1, 3, 1, 40] @@ -229,6 +233,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.squeeze_warpper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -239,6 +245,10 @@ def test_grad(self): class TestUnsqueezeDoubleGradCheck(unittest.TestCase): + def unsqueeze_wrapper(self, x): + axes = [1, 2] + return paddle.unsqueeze(x[0], axes) + @prog_scope() def func(self, place): x_shape = [3, 40] @@ -253,6 +263,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + 
self.unsqueeze_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -333,6 +345,10 @@ def test_grad(self): class TestConstantPadDoubleGradCheck(unittest.TestCase): + def pad_wrapper(self, x): + pad = [1, 1, 1, 1] + return paddle.nn.functional.pad(x[0], pad) + @prog_scope() def func(self, place): x_shape = [2, 3, 4, 5] @@ -347,6 +363,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.pad_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_onnx_export.py b/python/paddle/fluid/tests/unittests/test_onnx_export.py index 5efd586d849d1..07016d4290102 100644 --- a/python/paddle/fluid/tests/unittests/test_onnx_export.py +++ b/python/paddle/fluid/tests/unittests/test_onnx_export.py @@ -21,7 +21,7 @@ import paddle from paddle.static import InputSpec -from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import in_dygraph_mode, _test_eager_guard class LinearNet(paddle.nn.Layer): @@ -45,43 +45,46 @@ def forward(self, x, y, z): class TestExportWithTensor(unittest.TestCase): - def setUp(self): + def func_with_tensor(self): self.x_spec = paddle.static.InputSpec( shape=[None, 128], dtype='float32') - - def test_with_tensor(self): - if in_dygraph_mode(): - return model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x_spec]) + def test_with_tensor(self): + with _test_eager_guard(): + self.func_with_tensor() + self.func_with_tensor() + class TestExportWithTensor1(unittest.TestCase): - def setUp(self): + def func_with_tensor(self): self.x = paddle.to_tensor(np.random.random((1, 128))) - - def test_with_tensor(self): - if in_dygraph_mode(): - return model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x]) + def test_with_tensor(self): + with _test_eager_guard(): + self.func_with_tensor() + self.func_with_tensor() + class TestExportPrunedGraph(unittest.TestCase): - def setUp(self): + def func_prune_graph(self): + model = Logic() self.x = paddle.to_tensor(np.array([1])) self.y = paddle.to_tensor(np.array([-1])) - - def test_prune_graph(self): - if in_dygraph_mode(): - return - model = Logic() paddle.jit.to_static(model) out = model(self.x, self.y, z=True) paddle.onnx.export( model, 'pruned', input_spec=[self.x], output_spec=[out]) + def test_prune_graph(self): + # test eager + with _test_eager_guard(): + self.func_prune_graph() + self.func_prune_graph() + if __name__ == '__main__': - if not in_dygraph_mode(): - unittest.main() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py index e12d1826f286c..503bd9d0f9797 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py @@ -25,8 +25,7 @@ class TestHybridParallel(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_hybrid_parallel_sharding_logic(self): - # self.run_mnist_2gpu( - # 'hybrid_parallel_sharding_model.py') + self.run_mnist_2gpu('hybrid_parallel_sharding_model.py') self.run_mnist_2gpu( 'hybrid_parallel_sharding_model.py', eager_mode=False) diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py 
b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index dacb7a5b59957..3621fd1b9d445 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -629,7 +629,6 @@ def _calc_output(self, place, mode="test", dygraph=True): else: fluid.disable_dygraph() gen = paddle.seed(self._random_seed) - gen._is_init_py = False paddle.framework.random._manual_program_seed(self._random_seed) scope = fluid.core.Scope() with fluid.scope_guard(scope): diff --git a/python/paddle/fluid/tests/unittests/test_rrelu_op.py b/python/paddle/fluid/tests/unittests/test_rrelu_op.py new file mode 100644 index 0000000000000..9d33ce085b7f7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rrelu_op.py @@ -0,0 +1,326 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.nn.functional as F +from paddle.fluid import dygraph + +paddle.seed(102) +np.random.seed(102) + + +def ref_rrelu(x, lower, upper): + x_t = x.copy() + alpha = (lower + upper) / 2.0 + return np.where(x_t <= 0, alpha * x_t, x_t) + + +def ref_rrelu_nn(x, lower, upper): + return ref_rrelu(x, lower, upper) + + +def check_output(input, output, lower, upper): + lower_res = np.where(input <= 0, lower * input, input) + upper_res = np.where(input <= 0, upper * input, input) + return (output <= lower_res).all() and (output >= upper_res).all() + + +class TestFunctionalRReluAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float64') + self.lower_0 = 0.05 + self.lower_1 = 0.1 + self.upper_0 = 0.25 + self.upper_1 = 0.33 + + self.places = [ + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() else fluid.CPUPlace() + ] + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 4, 5], dtype="float32") + res1 = F.rrelu( + x=input, lower=self.lower_0, upper=self.upper_0, training=False) + res2 = F.rrelu( + x=input, lower=self.lower_1, upper=self.upper_1, training=False) + in_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype("float32") + + res_np1 = ref_rrelu(in_np, self.lower_0, self.upper_0) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res1]) + + self.assertTrue(np.allclose(fetches[0], res_np1)) + + res_np2 = ref_rrelu(in_np, self.lower_1, self.upper_1) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res2]) + self.assertTrue(np.allclose(fetches[0], res_np2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_static_graph_functional(self): + '''test_static_graph_functional''' + + for place in 
self.places: + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=self.x_np.shape, dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=self.x_np.shape, dtype="float64") + out_1 = F.rrelu(x_1, self.lower_0, self.upper_0, training=False) + out_2 = F.rrelu(x_2, self.lower_1, self.upper_1, training=False) + out_3 = F.rrelu(x_2, self.lower_1, self.upper_1, training=True) + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=out_1, + use_prune=True) + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_2, + use_prune=True) + res_3 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_3, + use_prune=True) + + out_ref_1 = ref_rrelu(self.x_np, self.lower_0, self.upper_0) + out_ref_2 = ref_rrelu(self.x_np, self.lower_1, self.upper_1) + self.assertEqual(np.allclose(out_ref_1, res_1), True) + self.assertEqual(np.allclose(out_ref_2, res_2), True) + self.assertTrue( + check_output(self.x_np, res_3[0], self.lower_1, self.upper_1)) + + def test_static_graph_layer(self): + '''test_static_graph_layer''' + + for place in self.places: + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=self.x_np.shape, dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=self.x_np.shape, dtype="float64") + # init instance + rrelu_1 = paddle.nn.RReLU(self.lower_0, self.upper_0) + rrelu_2 = paddle.nn.RReLU(self.lower_1, self.upper_1) + out_1 = rrelu_1(x_1) + out_2 = rrelu_2(x_2) + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=out_1, + use_prune=True) + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_2, + use_prune=True) + + self.assertTrue( + check_output(self.x_np, res_1[0], self.lower_0, self.upper_0)) + self.assertTrue( + check_output(self.x_np, res_2[0], self.lower_1, self.upper_1)) + + def dygraph_check(self, lower, upper): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self.x_np) + out = F.rrelu(x, lower, upper, training=False) + out_ref = ref_rrelu(self.x_np, lower, upper) + self.assertEqual(np.allclose(out_ref, out), True) + paddle.enable_static() + + def test_dygraph_functional(self): + '''test_dygraph_functional''' + + self.dygraph_check(self.lower_0, self.upper_0) + self.dygraph_check(self.lower_1, self.upper_1) + + def test_dygraph_layer(self): + '''test_dygraph_layer''' + + for place in self.places: + paddle.disable_static(place=place) + rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0) + result = rrelu(paddle.to_tensor(self.x_np)) + self.assertTrue( + check_output(self.x_np, + result.numpy(), self.lower_0, self.upper_0)) + paddle.enable_static() + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place=place) + with dygraph.guard(): + rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0) + out_np = rrelu(paddle.to_tensor(self.x_np)) + self.assertTrue( + check_output(self.x_np, + out_np.numpy(), self.lower_0, self.upper_0)) + paddle.enable_static() + + def test_error_functional(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises( + TypeError, F.rrelu, x=1, lower=self.lower_0, upper=self.upper_0) + # The input dtype must be float16, float32, float64. 
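+            # Non-floating-point inputs are expected to be rejected by the
+            # dtype check with a TypeError; only float16/float32/float64 are
+            # accepted.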
+ x_int32 = paddle.fluid.data( + name='x_int32', shape=[2, 3], dtype='int32') + self.assertRaises( + TypeError, + F.rrelu, + x=x_int32, + lower=self.lower_0, + upper=self.upper_0) + x_bool = paddle.fluid.data( + name='x_bool', shape=[2, 3], dtype='int32') + self.assertRaises( + TypeError, + F.rrelu, + x=x_bool, + lower=self.lower_0, + upper=self.upper_0) + # lower and upper must be float + x_fp32 = paddle.fluid.data( + name='x_fp32', shape=[2, 3], dtype='float32') + self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0, upper=0.5) + self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0.5, upper=1) + # lower and upper must be in (0, 1) + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=-1., upper=0.5) + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=2.) + # upper should not be less than lower + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=0.2) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[2, 3], dtype='float16') + F.rrelu(x=x_fp16, lower=self.lower_0, upper=self.upper_0) + + def test_error_layer(self): + def error_int_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float64") + rrelu = paddle.nn.RReLU(2, 3) + rrelu(paddle.to_tensor(x)) + + def error_lower_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0, 0.5) + rrelu(paddle.to_tensor(x)) + + def error_upper_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 1) + rrelu(paddle.to_tensor(x)) + + def error_lower_range(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(-1.0, 0.5) + rrelu(paddle.to_tensor(x)) + + def error_upper_range(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 2.0) + rrelu(paddle.to_tensor(x)) + + def error_lower_upper(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 0.2) + rrelu(paddle.to_tensor(x)) + + self.assertRaises(TypeError, error_int_dtype) + self.assertRaises(TypeError, error_lower_dtype) + self.assertRaises(TypeError, error_upper_dtype) + self.assertRaises(ValueError, error_lower_range) + self.assertRaises(ValueError, error_upper_range) + self.assertRaises(ValueError, error_lower_upper) + + +class RReluTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.1 + self.upper = 0.3 + self.is_test = True + self.init_prams() + + def init_prams(self): + self.dtype = "float64" + self.x_shape = [2, 3, 4, 5] + + x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) + out_np = ref_rrelu(x_np, self.lower, self.upper) + noise_np = np.ones(self.x_shape).astype(self.dtype) + noise_np[x_np < 0] = (self.lower + self.upper) / 2.0 + + self.inputs = {'X': x_np} + self.outputs = {'Out': out_np, 'Noise': noise_np} + self.attrs = { + 'lower': self.lower, + "upper": self.upper, + "is_test": self.is_test + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class RReluTrainingTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.3 + self.upper = 0.3000009 + self.is_test = False + self.init_prams() + + +class RReluTrainingTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.3 + self.upper = 0.3000009 + 
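+        # lower and upper are nearly equal, so the randomly sampled negative
+        # slope is effectively fixed and the training-mode output stays
+        # numerically checkable.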
self.is_test = False + self.init_prams() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 28bcc379fb9a0..c0157c5b9068c 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -95,7 +95,7 @@ def test_collective_3(self): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( + args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --nnodes 2".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) @@ -143,7 +143,7 @@ def test_ps_3(self): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( + args = "--job_id ps3 --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py index 9ac016511c20d..04b140cba4c0e 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -89,65 +89,6 @@ def set_input_data(self): self.x = np.random.random(self.x_shape).astype(self.dtype) self.y = np.random.random(self.y_shape).astype(self.dtype) - def set_test_axes(self): - self.all_axes = [] - axial_index = range(4) - all_permutations = list(it.permutations(axial_index, 0)) + list( - it.permutations(axial_index, 1)) + list( - it.permutations(axial_index, 2)) + list( - it.permutations(axial_index, 3)) + list( - it.permutations(axial_index, 4)) - self.all_axes.extend(list(i) for i in all_permutations) - - for axes_x in all_permutations: - for axes_y in all_permutations: - if len(axes_x) < len(axes_y): - supplementary_axes_x = axes_x + axes_y[len(axes_x):] - if any( - supplementary_axes_x.count(i) > 1 - for i in supplementary_axes_x): - continue - elif len(axes_y) < len(axes_x): - supplementary_axes_y = axes_y + axes_x[len(axes_y):] - if any( - supplementary_axes_y.count(i) > 1 - for i in supplementary_axes_y): - continue - self.all_axes.append([list(axes_x), list(axes_y)]) - - self.all_axes.extend(range(5)) - - def test_dygraph(self): - paddle.disable_static() - for axes in self.all_axes: - for place in self.places: - x = paddle.to_tensor(self.x, place=place) - y = paddle.to_tensor(self.y, place=place) - paddle_res = paddle.tensordot(x, y, axes) - np_res = tensordot_np(self.x, self.y, axes) - np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) - - def test_static(self): - paddle.enable_static() - for axes in self.all_axes: - for place in self.places: - with paddle.static.program_guard(paddle.static.Program(), - paddle.static.Program()): - x = paddle.static.data( - name='x', shape=self.x_shape, dtype=self.dtype) - y = paddle.static.data( - name='y', shape=self.y_shape, dtype=self.dtype) - z = paddle.tensordot(x, y, axes) - exe = paddle.static.Executor(place) - paddle_res = exe.run(feed={'x': self.x, - 'y': self.y}, - fetch_list=[z]) - np_res = tensordot_np(self.x, self.y, axes) - np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) - - -class TestTensordotAPIFloat64(TestTensordotAPI): - # Only test a small part of axes case for Float64 type def set_test_axes(self): self.all_axes = [ [[3, 2], [3]], [[2, 1, 0], [2, 1]], [[1, 2, 0], [1, 3, 2]], [3, 0], @@ -194,35 +135,65 @@ def set_test_axes(self): 
[[2, 0, 1], [0, 1, 3]], [[2, 1], [0, 1, 3]] ] + def test_dygraph(self): + paddle.disable_static() + for axes in self.all_axes: + for place in self.places: + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + + def test_static(self): + paddle.enable_static() + for axes in self.all_axes: + for place in self.places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x_shape, dtype=self.dtype) + y = paddle.static.data( + name='y', shape=self.y_shape, dtype=self.dtype) + z = paddle.tensordot(x, y, axes) + exe = paddle.static.Executor(place) + paddle_res = exe.run(feed={'x': self.x, + 'y': self.y}, + fetch_list=[z]) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) + + +class TestTensordotAPIFloat64(TestTensordotAPI): def set_dtype(self): self.dtype = np.float64 -class TestTensordotAPIBroadcastCase1(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase1(TestTensordotAPI): def set_input_shape(self): self.x_shape = [1, 1, 1, 5] self.y_shape = [1, 5, 1, 1] -class TestTensordotAPIBroadcastCase2(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase2(TestTensordotAPI): def set_input_shape(self): self.x_shape = [1, 5, 5, 5] self.y_shape = [1, 1, 1, 5] -class TestTensordotAPIBroadcastCase3(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase3(TestTensordotAPI): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [5, 5, 1, 5] -class TestTensordotAPIBroadcastCase4(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase4(TestTensordotAPI): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [1, 1, 1, 1] -class TestTensordotAPIBroadcastCase5(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase5(TestTensordotAPI): def set_input_shape(self): self.x_shape = [1, 1, 5, 5] self.y_shape = [5, 5, 1, 5] diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py index 5bb3e99ee302f..1a6790728b137 100644 --- a/python/paddle/fluid/tests/unittests/test_trunc_op.py +++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py @@ -30,7 +30,7 @@ class TestTruncOp(OpTest): def setUp(self): self.op_type = "trunc" self.python_api = paddle.trunc - self.dtype = np.float64 + self.init_dtype_type() np.random.seed(2021) self.inputs = {'X': np.random.random((20, 20)).astype(self.dtype)} self.outputs = {'Out': (np.trunc(self.inputs['X']))} @@ -48,11 +48,19 @@ def test_check_grad(self): class TestFloatTruncOp(TestTruncOp): def init_dtype_type(self): self.dtype = np.float32 + self.__class__.exist_fp64_check_grad = True + + def test_check_grad(self): + pass class TestIntTruncOp(TestTruncOp): def init_dtype_type(self): self.dtype = np.int32 + self.__class__.exist_fp64_check_grad = True + + def test_check_grad(self): + pass class TestTruncAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py index 2ba808a341e5e..5f4989f6c5dbd 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py @@ -178,7 +178,6 @@ class TestUniformRandomOpAPISeed(unittest.TestCase): def 
test_attr_tensor_API(self): _seed = 10 gen = paddle.seed(_seed) - gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 0b27c61623089..0bca3c08f3d78 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -370,7 +370,6 @@ class TestUniformRandomOp_API_seed(unittest.TestCase): def test_attr_tensor_API(self): _seed = 10 gen = paddle.seed(_seed) - gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index 23bbc377cae27..ea3264ba0dbb7 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -35,4 +35,5 @@ 'eigh', 'eigvalsh', 'class_center_sample', + 'einsum', ] diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index ffd1607fe87b4..a3584a73dfae1 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -55,5 +55,6 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from ..fluid.framework import in_dygraph_mode # noqa: F401 +from ..fluid.framework import _in_legacy_dygraph # noqa: F401 __all__ = [] diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 147f6be39c5e0..b58d36b8e7d50 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -44,10 +44,8 @@ def seed(seed): if core.is_compiled_with_cuda(): for i in range(core.get_cuda_device_count()): - core.default_cuda_generator(i)._is_init_py = True core.default_cuda_generator(i).manual_seed(seed) - core.default_cpu_generator()._is_init_py = True return core.default_cpu_generator().manual_seed(seed) diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index ff7a167f1a670..c354baf3b43b7 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -32,6 +32,7 @@ import paddle.incubate.autotune from . import nn #noqa: F401 +from . import asp #noqa: F401 __all__ = [ 'LookAhead', diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py b/python/paddle/incubate/asp/__init__.py similarity index 51% rename from python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py rename to python/paddle/incubate/asp/__init__.py index 7a3fa0244930c..59f794ef28aa4 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py +++ b/python/paddle/incubate/asp/__init__.py @@ -13,25 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
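# A minimal sketch of the relocated sparsity helpers; the function name is
# taken from the __all__ list added below, while the sample matrix is
# hypothetical. calculate_density reports the fraction of non-zero entries.
import numpy as np
from paddle.incubate import asp

w = np.array([[0.0, 1.0, 0.0, 2.0],
              [3.0, 0.0, 4.0, 0.0]])
print(asp.calculate_density(w))  # 0.5: half of the entries are non-zero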
-from __future__ import print_function - -import unittest -import paddle -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning1D(TestASPHelperPruningBase): - def test_1D_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D) - - def test_1D_training_pruning(self): - self.run_training_pruning_test( - 'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D) - - -if __name__ == '__main__': - unittest.main() +from ...fluid.contrib.sparsity import calculate_density #noqa: F401 +from ...fluid.contrib.sparsity import decorate #noqa: F401 +from ...fluid.contrib.sparsity import prune_model #noqa: F401 +from ...fluid.contrib.sparsity import set_excluded_layers #noqa: F401 +from ...fluid.contrib.sparsity import reset_excluded_layers #noqa: F401 + +__all__ = [ #noqa + 'calculate_density', + 'decorate', + 'prune_model', + 'set_excluded_layers', + 'reset_excluded_layers' +] diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 7a969748208a4..1f5c4f9a5cebb 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -38,8 +38,7 @@ def topo_path(xs, ys, block=None): path, the unused variables in `xs`, and the unreached variables in `ys` """ - if block is None: - block = default_main_program().current_block() + block = default_main_program().current_block() if block is None else block path = [] backpath = [] @@ -160,11 +159,14 @@ def contain_value(self, value_var): return id(value_var) in self.tab.values() +# TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. class Transform(object): """ An object that maintains the state of transformations applied to a primitve program. """ def __init__(self, block): + assert block == default_main_program().current_block( + ), f'only support transform on current block of main program.' self.block = block self.vars = self.init_vars(block) self.var2dot = VarMap('var2dot', self.vars) @@ -400,6 +402,7 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): return ys_bar, xs_bar +# TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. def _lower(block, reverse): # Some functions which are only used in _lower. def bind(args, to_bind, value_table): @@ -430,10 +433,6 @@ def expand_nested_list(xs): # Step1: Do some preparatory work for lower lower_fn = _prim2orig if reverse else _orig2prim lookup_fn = lookup_prim2orig if reverse else lookup_orig2prim - if block is None: - program = default_main_program() - assert program.num_blocks == 1, "The lower transform is designed to process only one block." - block = program.current_block() value_table = {} to_bind = {} @@ -516,6 +515,7 @@ def orig2prim(block=None): """ .. note:: **This API is ONLY available in the static mode.** + **Args block must be None or current block of main program.** All operators in the target block are processed as follows. If it is an original operator, it will be transformed into @@ -523,13 +523,14 @@ def orig2prim(block=None): equivalent function. Args: - block(paddle.fluid.framework.Variable|None, optional): The + block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. 
- - Returns: - None """ + + block = default_main_program().current_block() if block is None else block + assert block == default_main_program().current_block( + ), f'block is neither None nor current block of main program' _lower(block, reverse=False) @@ -538,6 +539,7 @@ def prim2orig(block=None): """ .. note:: **ONLY available in the static mode.** + **Args block must be None or current block of main program.** All operators in the target block are processed as follows. If it is an automatic differential basic operator, it will be @@ -545,10 +547,10 @@ def prim2orig(block=None): equivalent function to support execution. Args: - block(paddle.static.Variable|None, optional): The + block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. - + Examples: .. code-block:: python @@ -566,6 +568,10 @@ def prim2orig(block=None): if prim_enabled(): prim2orig() """ + + block = default_main_program().current_block() if block is None else block + assert block == default_main_program().current_block( + ), f'block is neither None nor current block of main program' _lower(block, reverse=True) @@ -583,7 +589,9 @@ def _gradients(ys, xs, ys_bar=None): """ ys, xs = to_tensors(ys), to_tensors(xs) - block = ys[0].block + block = default_main_program().current_block() + for el in xs + ys: + assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program' # TODO(Tongxin) without any prior knowledge about whether the program # is completely lowered to primitive ops, it's mandatory to run the lowering # pass once and again. This is obviously inefficient and needs to be diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py index 43fcabf97317e..3c806aa646ebe 100644 --- a/python/paddle/incubate/nn/__init__.py +++ b/python/paddle/incubate/nn/__init__.py @@ -16,10 +16,12 @@ from .layer.fused_transformer import FusedFeedForward # noqa: F401 from .layer.fused_transformer import FusedTransformerEncoderLayer # noqa: F401 from .layer.fused_transformer import FusedMultiTransformer # noqa: F401 +from .layer.fused_transformer import FusedBiasDropoutResidualLayerNorm # noqa: F401 __all__ = [ #noqa 'FusedMultiHeadAttention', 'FusedFeedForward', 'FusedTransformerEncoderLayer', 'FusedMultiTransformer', + 'FusedBiasDropoutResidualLayerNorm', ] diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py index 4da090487785b..02e44548ce5d8 100644 --- a/python/paddle/incubate/nn/functional/__init__.py +++ b/python/paddle/incubate/nn/functional/__init__.py @@ -15,9 +15,11 @@ from .fused_transformer import fused_multi_head_attention from .fused_transformer import fused_feedforward from .fused_transformer import fused_multi_transformer +from .fused_transformer import fused_bias_dropout_residual_layer_norm __all__ = [ 'fused_multi_head_attention', 'fused_feedforward', 'fused_multi_transformer', + 'fused_bias_dropout_residual_layer_norm', ] diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 3e263f1c6d3ae..232e16415a5f7 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -212,6 +212,151 @@ def fused_feedforward(x, return out +def fused_bias_dropout_residual_layer_norm(x, + residual, + bias=None, + ln_scale=None, + ln_bias=None, + dropout_rate=0.5, + 
ln_epsilon=1e-5, + training=True, + mode='upscale_in_train', + name=None): + r""" + The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: + + .. code-block:: python + y = layer_norm(residual + dropout(bias + x)) + + Parameters: + x (Tensor): The input tensor. The shape is `[*, embed\_dim]`. + residual (Tensor): The residual tensor. The shape is same as x. + bias (Tensor, optional): The bias of linear. The shape is `[embed_dim]`. Default None. + ln_scale (Tensor, optional): The weight tensor of layernorm. The shape is `[embed_dim]`. Default None. + ln_bias (Tensor, optional): The bias tensor of layernorm. The shape is `[embed_dim]`. Default None. + dropout_rate (float, optional): The dropout probability used on attention + weights to drop some attention targets for the dropout after attention. + 0 for no dropout. Default 0.5. + ln_epsilon (float, optional): Small float value added to denominator of layer_norm + to avoid dividing by zero. Default is 1e-5. + training (bool, optional): A flag indicating whether it is in train phrase or not. Default True. + mode (str, optional): ['upscale_in_train'(default) | 'downscale_in_infer'] + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - p ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - p) + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The output Tensor, the data type and shape is same as `x`. + + Examples: + + .. code-block:: python + + # required: gpu + import paddle + import paddle.incubate.nn.functional as F + + # input: [batch_size, seq_len, embed_dim] + x = paddle.rand(shape=(2, 4, 128), dtype="float32") + # residual: [batch_size, seq_len, embed_dim] + residual = paddle.rand(shape=(2, 4, 128), dtype="float32") + # linear bias: [embed_dim] + bias = paddle.rand(shape=[128], dtype="float32") + # output: [batch_size, seq_len, embed_dim] + output = F.fused_bias_dropout_residual_layer_norm( + x, residual, bias) + # [2, 4, 128] + print(output.shape) + """ + seed = None + if mode not in ('downscale_in_infer', 'upscale_in_train'): + raise ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + if ln_scale is not None: + assert len(ln_scale. + shape) == 1, "The dims of the shape of ln_scale should be 1." + assert x.shape[len(x.shape) - 1] == ln_scale.shape[ + 0], "The dim of ln_scale must equal to the last dim of x." + if ln_bias is not None: + assert len( + ln_bias.shape) == 1, "The dims of the shape of ln_bias should be 1." + assert x.shape[len(x.shape) - 1] == ln_bias.shape[ + 0], "The dim of ln_bias must equal to the last dim of x." 
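+    # Reference semantics (with dropout_rate = 0 for clarity):
+    #     out = layer_norm(residual + (x + bias))
+    # In dynamic graph mode the fused kernel is called directly below; in
+    # static graph mode the op is appended to the program together with
+    # output variables for the dropout mask and the layer_norm mean/variance.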
+ + if _non_static_mode(): + if default_main_program().random_seed != 0: + seed = default_main_program().random_seed + _, _, _, _, final_out = _C_ops.fused_bias_dropout_residual_layer_norm( + x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate, + 'ln_epsilon', ln_epsilon, 'is_test', not training, + 'dropout_fix_seed', seed is not None, 'dropout_seed', seed + if seed is not None else 0, 'dropout_implementation', mode) + return final_out + else: + helper = LayerHelper('fused_bias_dropout_residual_layer_norm', + **locals()) + dtype = x.dtype + # check dtypes + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm') + # set inputs + inputs = dict() + inputs['X'] = [x] + inputs['Residual'] = [residual] + if bias is not None: + inputs['Bias'] = [bias] + if ln_scale: + inputs['LnScale'] = [ln_scale] + if ln_bias: + inputs['LnBias'] = [ln_bias] + if (seed is None or seed == 0) and helper.main_program.random_seed != 0: + seed = helper.main_program.random_seed + # set attrs + attrs = { + 'ln_epsilon': ln_epsilon, + 'dropout_rate': dropout_rate, + 'is_test': not training, + 'dropout_fix_seed': seed is not None, + 'dropout_seed': seed if seed is not None else 0, + 'dropout_implementation': mode, + } + # set outputs + dropout_mask_out = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + ln_mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + ln_variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + bias_dropout_residual_out = helper.create_variable_for_type_inference( + dtype=dtype) + final_out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='fused_bias_dropout_residual_layer_norm', + inputs=inputs, + outputs={ + "BiasDropoutResidualOut": bias_dropout_residual_out, + "DropoutMaskOut": dropout_mask_out, + "LnMean": ln_mean_out, + "LnVariance": ln_variance_out, + 'Y': final_out, + }, + attrs=attrs) + return final_out + + def fused_multi_head_attention(x, qkv_weight, linear_weight, @@ -368,10 +513,9 @@ def fused_multi_head_attention(x, attn_mask, linear_weight, linear_bias, ln_scale, ln_bias, 'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon, 'dropout_rate', dropout_rate, 'attn_dropout_rate', - attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'attn_dropout_is_test', - not training, 'dropout_is_test', not training, - 'attn_dropout_fix_seed', seed is not None, 'dropout_fix_seed', - seed is not None, 'attn_dropout_seed', seed + attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test', + not training, 'attn_dropout_fix_seed', seed is not None, + 'dropout_fix_seed', seed is not None, 'attn_dropout_seed', seed if seed is not None else 0, 'dropout_seed', seed if seed is not None else 0, 'attn_dropout_implementation', mode, 'dropout_implementation', mode, 'ring_id', ring_id) @@ -417,8 +561,7 @@ def fused_multi_head_attention(x, 'ln_epsilon': ln_epsilon, 'dropout_rate': dropout_rate, 'attn_dropout_rate': attn_dropout_rate, - 'attn_dropout_is_test': not training, - 'dropout_is_test': not training, + 'is_test': not training, 'attn_dropout_fix_seed': seed is not None, 'dropout_fix_seed': seed is not None, 'attn_dropout_seed': seed if seed is not None else 0, @@ -656,7 +799,7 @@ def fused_multi_transformer(x, time_step, attn_mask, linear_weights, linear_biases, 
ffn_ln_scales, ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, - 'dropout_rate', dropout_rate, 'dropout_is_test', not training, + 'dropout_rate', dropout_rate, 'is_test', not training, 'dropout_implementation', mode, 'act_method', activation, 'ring_id', ring_id) if cache_kvs is not None: @@ -703,7 +846,7 @@ def fused_multi_transformer(x, 'pre_layer_norm': pre_layer_norm, 'epsilon': epsilon, 'dropout_rate': dropout_rate, - 'dropout_is_test': not training, + 'is_test': not training, 'dropout_implementation': mode, 'act_method': activation, 'ring_id': ring_id diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 072c7d9fccade..a64b7e506021c 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -36,6 +36,103 @@ def _set_var_distributed(var): main_block._find_var_recursive(var.name).is_distributed = True +class FusedBiasDropoutResidualLayerNorm(Layer): + """ + Applies fused_bias_dropout_residual_layer_norm operation. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + dropout_rate (float, optional): The dropout probability used on attention + weights to drop some attention targets for the dropout after attention. + 0 for no dropout. Default 0.5. + bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + If it is set to False, this layer will not have trainable bias parameter. + See usage for details in :code:`ParamAttr`. + epsilon (float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + + Examples: + + .. code-block:: python + + # required: gpu + import paddle + # input: [batch_size, seq_len, embed_dim] + x = paddle.rand((2, 4, 128)) + # residual: [batch_size, seq_len, embed_dim] + residual = paddle.rand((2, 4, 128)) + fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128) + output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + dropout_rate=0.5, + weight_attr=None, + bias_attr=None, + epsilon=1e-5, + name=None): + super(FusedBiasDropoutResidualLayerNorm, self).__init__() + assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " + "but recieved {}".format(embed_dim)) + self._dtype = self._helper.get_default_dtype() + self._bias_attr = bias_attr + self._weight_attr = weight_attr + self.embed_dim = embed_dim + self.linear_bias = self.create_parameter( + shape=[embed_dim], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.ln_scale = self.create_parameter( + attr=self._weight_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.ln_bias = self.create_parameter( + attr=self._bias_attr, shape=[embed_dim], is_bias=True) + self.dropout_rate = dropout_rate + self._epsilon = epsilon + + self.name = name + + def forward(self, x, residual): + """ + Applies fused_bias_dropout_residual_layer_norm operation. + + Parameters: + x (Tensor): The input tensor. It is a tensor with shape + `[batch_size, seq_len, embed_dim]`. The data type should be + float32 or float64. + residual (Tensor, optional): The residual tensor. It is a tensor + with shape `[batch_size, value_length, vdim]`. The data type + should be float32 or float64. 
+
+ Returns:
+ Tensor: The output tensor, which has the same shape and data type \
+ as `x`.
+ """
+
+ out = incubate_f.fused_bias_dropout_residual_layer_norm(
+ x=x,
+ residual=residual,
+ bias=self.linear_bias,
+ ln_scale=self.ln_scale,
+ ln_bias=self.ln_bias,
+ dropout_rate=self.dropout_rate,
+ ln_epsilon=self._epsilon,
+ training=self.training,
+ mode='upscale_in_train',
+ name=self.name)
+ return out
+
+ def extra_repr(self):
+ name_str = ', name={}'.format(self.name) if self.name else ''
+ return 'embed_dim={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
+ self.embed_dim, self.dropout_rate, self._epsilon,
+ self._dtype, name_str)
+
+
class FusedMultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 118004088da16..d399cb2052498 100644
--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
@@ -771,7 +771,7 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` .
- label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` .
+ label(Tensor): The label of dataset. Tensor with type int64. The shape is ``[sample_number, 1]`` .
k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. total(Tensor, optional): The total entries count. A tensor with type int64 or int32.
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index bceee4b964a33..b4be291b0697f 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -51,6 +51,7 @@ from .layer.activation import ThresholdedReLU # noqa: F401 from .layer.activation import LogSoftmax # noqa: F401 from .layer.activation import Maxout # noqa: F401
+from .layer.activation import RReLU # noqa: F401
from .layer.common import Pad1D # noqa: F401 from .layer.common import Pad2D # noqa: F401 from .layer.common import ZeroPad2D # noqa: F401
@@ -313,4 +314,5 @@ def weight_norm(*args): 'MaxUnPool3D', 'HingeEmbeddingLoss', 'Identity',
+ 'RReLU',
]
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 68213d831c550..fa5a56c468620 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -47,6 +47,7 @@ from .activation import log_softmax # noqa: F401 from .activation import glu # noqa: F401 from .activation import gumbel_softmax # noqa: F401
+from .activation import rrelu # noqa: F401
from .common import dropout # noqa: F401 from .common import dropout2d # noqa: F401 from .common import dropout3d # noqa: F401
@@ -118,8 +119,8 @@ from .vision import channel_shuffle # noqa: F401 from .input import one_hot # noqa: F401 from .input import embedding # noqa: F401
-from ...fluid.layers import gather_tree # noqa: F401
-from ...fluid.layers import temporal_shift # noqa: F401
+from .extension import gather_tree # noqa: F401
+from .extension import temporal_shift # noqa: F401
from .sparse_attention import sparse_attention
@@ -228,4 +229,5 @@ 'class_center_sample', 'sparse_attention', 'fold',
+ 'rrelu',
]
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index
e64efda7b33bf..dd314868b69e2 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.layers import sigmoid # noqa: F401 +from ...tensor.ops import sigmoid # noqa: F401 from ...tensor.math import tanh # noqa: F401 from ...tensor.math import tanh_ # noqa: F401 @@ -63,8 +63,10 @@ def celu(x, alpha=1.0, name=None): if alpha == 0: raise ZeroDivisionError("alpha cannot be 0 for celu") - if in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.celu(x, 'alpha', alpha) + if in_dygraph_mode(): + return _C_ops.final_state_celu(x, alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu') helper = LayerHelper("celu", **locals()) @@ -548,6 +550,122 @@ def prelu(x, weight, data_format="NCHW", name=None): return out +def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): + r""" + rrelu activation. + + Applies the randomized leaky rectified liner unit function to improve generalization performance, + as described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_ + + During training, randomly samples the negative slope for activation values as described below: + + .. math:: + + rrelu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + a * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`a` is randomly sampled from uniform distribution in range (:math:`lower`, :math:`upper`), + + In the test phase, the negative slope will take the average value of :math:`lower` and :math:`upper`: + + .. math:: + + rrelu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + (lower + upper) * 0.5 * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`lower` and :math:`upper` are the bounds of uniform distribution. + + Parameters: + x (Tensor): The input Tensor with data type float16, float32, float64. + lower (float, optional): The lower bound of uniform distribution. Default: 0.125. + upper (float, optional): The upper bound of uniform distribution. Default: 0.333. + training (bool, optional): Current mode is in training or others. Default is True. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + :name: rrelu-example + + import paddle + import paddle.nn.functional as F + + input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + + out = F.rrelu(input_tensor, 0.1, 0.3) + #[[[[-0.20000899 3. -0.8810822 5. ] + # [ 3. -0.55175185 5. -1.0776101 ] + # [-1.0680687 -1.9896201 8. 9. ]] + # [[ 1. -0.5238267 -0.65515125 4. ] + # [-1.3766339 6. 7. -2.3465784 ] + # [ 6. 7. 8. 9. ]]]] + """ + + if not in_dynamic_mode(): + check_variable_and_dtype(x, 'X', ['float16', 'float32', 'float64'], + 'rrelu') + + if not isinstance(lower, float) or not isinstance(upper, float): + raise TypeError( + "The lower and upper values must be float type. Received: lower {}, upper {}.". + format(lower, upper)) + + if lower < 0 or lower > 1: + raise ValueError( + "The lower value must be no less than zero or greater than one. Received: {}.". 
+ format(lower)) + + if upper < lower: + raise ValueError( + "The upper value must be greater than lower value. Received: lower {}, upper {}.". + format(lower, upper)) + + if upper > 1: + raise ValueError( + "The upper value must be no greater than one. Received: {}.".format( + upper)) + + is_test = not training + + if _in_legacy_dygraph(): + out, noise = _C_ops.rrelu(x, 'lower', lower, 'upper', upper, 'is_test', + is_test) + return out + + helper = LayerHelper('rrelu', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + noise = helper.create_variable_for_type_inference(dtype=x.dtype) + attrs = {'lower': lower, 'upper': upper, 'is_test': is_test} + helper.append_op( + type='rrelu', + inputs={"X": x}, + outputs={"Out": out, + "Noise": noise}, + attrs=attrs) + return out + + def relu(x, name=None): """ relu activation. diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index fe37b8fb97c3d..7fed1dbb487fa 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -21,7 +21,6 @@ from paddle.static import Variable from ...fluid import dygraph_utils # TODO: define the common functions to build a neural network -from ...fluid.layers import unfold # noqa: F401 from ...tensor.manipulation import squeeze from ...tensor.manipulation import unsqueeze from ...tensor import clip @@ -31,8 +30,6 @@ from ...fluid.framework import _varbase_creator, _in_legacy_dygraph, in_dygraph_mode, _non_static_mode from ...fluid import dygraph_utils -from ...fluid import layers -from ...fluid.data_feeder import check_variable_and_dtype from paddle import _C_ops from paddle.framework import in_dynamic_mode @@ -44,6 +41,135 @@ __all__ = [] +def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): + r""" + + This op returns a col buffer of sliding local blocks of input x, also known + as im2col for batched 2D image tensors. For each block under the convolution filter, + all element will be rearranged as a column. While the convolution filter sliding over + the input feature map, a series of such columns will be formed. + + For each input :math:`x` with shape [N, C, H, W], the output shape [N, Cout, Lout] + can be calculated as following. + + .. math:: + + dkernel[0] &= dilations[0] \times (kernel\_sizes[0] - 1) + 1 + + dkernel[1] &= dilations[1] \times (kernel\_sizes[1] - 1) + 1 + + hout &= \frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 + + wout &= \frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 + + Cout &= C \times kernel\_sizes[0] \times kernel\_sizes[1] + + Lout &= hout \times wout + + + Parameters: + x(Tensor): 4-D Tensor, input tensor of format [N, C, H, W], + data type can be float32 or float64 + kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + or an integer k treated as [k, k]. + strides(int|list): The strides, should be [stride_h, stride_w] + or an integer stride treated as [sride, stride]. + For default, strides will be [1, 1]. + paddings(int|list): The paddings of each dimension, should be + [padding_top, padding_left, padding_bottom, padding_right] + or [padding_h, padding_w] or an integer padding. + If [padding_h, padding_w] was given, it will expanded to + [padding_h, padding_w, padding_h, padding_w]. If an integer + padding was given, [padding, padding, padding, padding] will + be used. 
For default, paddings will be [0, 0, 0, 0] + dilations(int|list): the dilations of convolution kernel, should be + [dilation_h, dilation_w], or an integer dilation treated as + [dilation, dilation]. For default, it will be [1, 1]. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + + Returns: + The tensor corresponding to the sliding local blocks. + The output shape is [N, Cout, Lout] as decriabled above. + Cout is the total number of values within each block, + and Lout is the total number of such blocks. + The data type of output is the same as the input :math:`x` + + Return Type: + Tensor + + Examples: + + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.randn((100,3,224,224)) + y = F.unfold(x, [3, 3], 1, 1, 1) + """ + + helper = LayerHelper("unfold", **locals()) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') + + assert len(x.shape) == 4, \ + "input should be the format of [N, C, H, W]" + + if isinstance(kernel_sizes, int): + kernel_sizes = [kernel_sizes, kernel_sizes] + else: + assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ + "kernel_sizes should either be an integer or a list of two integers" + + if isinstance(strides, int): + strides = [strides, strides] + else: + assert isinstance(strides, list) and (len(strides) == 2), \ + "strides should either be an integer or a list of two integers" + + if isinstance(dilations, int): + dilations = [dilations, dilations] + else: + assert isinstance(dilations, list) and (len(dilations) == 2), \ + "dilations should either be an integer or a list of two integers" + + if isinstance(paddings, int): + paddings = [paddings] * 4 + elif isinstance(paddings, list): + if len(paddings) == 2: + paddings = paddings * 2 + elif len(paddings) == 4: + pass + else: + raise ValueError( + "paddings should either be an integer or a list of 2 or 4 integers" + ) + else: + raise ValueError( + "Unexpected type of paddings, it should be either an integer or a list" + "of 2 or 4 integers") + + if in_dygraph_mode(): + return _C_ops.final_state_unfold(x, kernel_sizes, strides, paddings, + dilations) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="unfold", + inputs={"X": x}, + outputs={"Y": out}, + attrs={ + "kernel_sizes": kernel_sizes, + "strides": strides, + "paddings": paddings, + "dilations": dilations + }) + return out + + def interpolate(x, size=None, scale_factor=None, @@ -1295,7 +1421,23 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): if mode == "constant" and isinstance(pad, ( list, tuple)) and len(pad) == x_dim * 2: - return layers.pad(x, pad, pad_value=value) + paddings = pad + pad_value = value + check_variable_and_dtype(x, 'x', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], "pad") + + helper = LayerHelper('pad', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='pad', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'paddings': paddings, + 'pad_value': float(pad_value)}) + return out assert x_dim in [ 3, 4, 5 diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 84aadbbac649b..419014daf64e4 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -126,6 +126,23 @@ def 
_conv_nd(x, pre_bias = _C_ops.final_state_conv2d( x, weight, stride, padding, padding_algorithm, groups, dilation, data_format, False, -1, False) + if bias is not None: + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + if len(bias.shape) < len(x.shape): + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return _C_ops.final_state_add(pre_bias, bias) + else: + return pre_bias + + if in_dygraph_mode() and op_type == "depthwise_conv2d": + pre_bias = _C_ops.final_state_depthwise_conv2d( + x, weight, stride, padding, padding_algorithm, groups, dilation, + data_format, False, -1, False, False) if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim @@ -135,6 +152,21 @@ def _conv_nd(x, return _C_ops.final_state_add(pre_bias, tmp_bias) else: return pre_bias + + if in_dygraph_mode() and op_type == "conv3d": + pre_bias = _C_ops.final_state_conv3d( + x, weight, stride, padding, padding_algorithm, groups, dilation, + data_format, False, -1, False) + if bias is not None: + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return pre_bias + if in_dynamic_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 2483eab6c053a..5a6bf4c0fa650 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -21,8 +21,12 @@ from ...tensor.creation import assign from ...fluid import dygraph_utils from ...tensor.layer_function_generator import templatedoc -from ...fluid.layers.sequence_lod import sequence_mask #noqa: F401 from paddle import in_dynamic_mode +from paddle import _C_ops +from ...fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode +from ...fluid.data_feeder import check_variable_and_dtype, check_type +from ...framework import core +from ...common_ops_import import convert_np_dtype_to_dtype_ __all__ = [] @@ -140,3 +144,240 @@ def __check_input(input, offset, dim1, dim2): outputs={'Out': [out]}) out.stop_gradient = True return out + + +def sequence_mask(x, maxlen=None, dtype='int64', name=None): + r""" + **SequenceMask Layer** + + This layer outputs a mask according to the input :code:`x` and + :code:`maxlen` with data type of :code:`dtype`. + + Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the + :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: + + .. math:: + + y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n)) + + .. code-block:: text + + Case: + + Consider input: + x = [3, 1, 1, 0] max_len = 4 + + then we get out: + mask = [[1, 1, 1, 0], + [1, 0, 0, 0], + [1, 0, 0, 0], + [0, 0, 0, 0]] + + Args: + x (Variable): Input tensor of sequence_mask layer, \ + whose elements are integers less than :code:`maxlen`. \ + Tensor or LodTensor with shape [d_1, d_2, ..., d_n]. + maxlen (int, optional): Maximum length of the sequence. If :code:`maxlen` \ + is None, it would be replace with :math:`max(x)`. + dtype (np.dtype|paddle.dtype|str, optional): Data type of the output, \ + ``int64`` by default. 
+ name(str, optional): For detailed information, please refer \ + to :ref:`api_guide_Name`. Usually name is no need to set and \ + None by default. + + Returns: The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ + and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, \ + int32 or int64. + + Return Type: Tensor + + Examples: + .. code-block:: python + + import paddle + + lengths = paddle.to_tensor([10, 9, 8]) + mask = paddle.nn.functional.sequence_mask(lengths) + + print(mask.numpy()) + # [[1 1 1 1 1 1 1 1 1 1] + # [1 1 1 1 1 1 1 1 1 0] + # [1 1 1 1 1 1 1 1 0 0]] + + """ + + if in_dygraph_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if maxlen is not None: + if isinstance(maxlen, core.eager.Tensor): + attrs = ('out_dtype', dtype) + out = _C_ops.sequence_mask(x, maxlen, *attrs) + else: + attrs = ('out_dtype', dtype, 'maxlen', maxlen) + out = _C_ops.sequence_mask(x, None, *attrs) + out.stop_gradient = True + return out + + helper = LayerHelper('sequence_mask', **locals()) + out = helper.create_variable_for_type_inference(dtype=dtype) + + inputs = {'X': [x]} + attrs = {'out_dtype': out.dtype} + if maxlen is not None: + if isinstance(maxlen, Variable): + inputs['MaxLenTensor'] = maxlen + else: + attrs['maxlen'] = maxlen + + helper.append_op( + type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs) + + out.stop_gradient = True + return out + + +def gather_tree(ids, parents): + r""" + To be used after beam search. After beam search, we get selected ids at + each time step and the corresponding parents in the search tree. Both ids + and parents have the layout :attr:`[max_time, batch_size, beam_size]`. Then + :attr:`gather_tree` is used to backtrace from the last time step and + generate the full sequences by collecting selected ids. + + Here is an example: + + .. code-block:: text + + Given: + ids = [[[2 2] + [6 1]] + [[3 9] + [6 1]] + [[0 1] + [9 0]]] + parents = [[[0 0] + [1 1]] + [[1 0] + [1 0]] + [[0 0] + [0 1]]] + + Then: + gather_tree(ids, parents) + = [[[2 2] + [1 6]] + [[3 3] + [6 1]] + [[0 1] + [9 0]]] + + Args: + ids(Tensor): A Tensor with shape :attr:`[length, batch_size, beam_size]` + and data type :attr:`int32` or :attr:`int64`. It contains the selected + ids of all time steps. + parents(Tensor): A Tensor with the same shape and data type as :attr:`ids`, + It contains the parents corresponding to selected ids when searching + among beams. + + Returns: + A Tensor with the same shape and data type as :attr:`ids`. \ + It contains the full sequences. The sequences are collected from \ + :attr:`ids` by backtracing according to :attr:`parents`. + + Examples: + .. 
code-block:: python + + import paddle + + ids = paddle.to_tensor([[[2, 2], [6, 1]], [[3, 9], [6, 1]], [[0, 1], [9, 0]]]) + + parents = paddle.to_tensor([[[0, 0], [1, 1]], [[1, 0], [1, 0]], [[0, 0], [0, 1]]]) + + final_sequences = paddle.nn.functional.gather_tree(ids, parents) + # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]] + + """ + if in_dygraph_mode(): + return _C_ops.final_state_gather_tree(ids, parents) + else: + if _in_legacy_dygraph(): + return _C_ops.gather_tree(ids, parents) + else: + helper = LayerHelper('gather_tree', **locals()) + check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], + 'gather_tree') + check_variable_and_dtype(parents, 'parents', ['int32', 'int64'], + 'gather_tree') + out = helper.create_variable_for_type_inference(dtype=ids.dtype) + + helper.append_op( + type="gather_tree", + inputs={"Ids": ids, + "Parents": parents}, + outputs={"Out": out}) + + return out + + +@templatedoc() +def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): + """ + + **Temporal Shift Operator** + + ${comment} + + Args: + x(Tensor): ${x_comment} + seg_num(int): ${seg_num_comment} + shift_ratio(float): ${shift_ratio_comment} + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCHW" or "NHWC". Default: "NCHW". + + Returns: + out(Tensor): The temporal shifting result is a tensor with the + same shape and same data type as the input. + + Raises: + TypeError: seg_num must be int type. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.randn([6, 4, 2, 2]) + out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) + """ + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. 
" + "Received Attr(data_format): {}.".format(data_format)) + if _non_static_mode(): + return _C_ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', + shift_ratio, 'data_format', data_format) + + helper = LayerHelper("temporal_shift", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') + check_type(seg_num, 'seg_num', int, 'temporal_shift') + check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(seg_num, int): + raise TypeError("seg_num must be int type.") + + helper.append_op( + type="temporal_shift", + inputs={"X": x}, + outputs={"Out": out}, + attrs={ + "seg_num": seg_num, + "shift_ratio": shift_ratio, + "data_format": data_format + }) + return out diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index cfbf015ffa05f..92b3a7054d467 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -200,7 +200,9 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): raise ValueError("padding_idx must be within [-{}, {})".format( weight.shape[0], weight.shape[0])) - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_embedding(x, weight, padding_idx, sparse) + elif _in_legacy_dygraph(): return _C_ops.lookup_table_v2( weight, x, 'is_sparse', sparse, 'is_distributed', False, 'remote_prefetch', False, 'padding_idx', padding_idx) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index d08821e510c2b..c0527a7a65201 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -21,15 +21,7 @@ import paddle import paddle.fluid as fluid from ...fluid.layers.nn import _elementwise_op_in_dygraph -from ...fluid.layers import dice_loss # noqa: F401 -from ...fluid.layers import log_loss # noqa: F401 -from ...fluid.layers import npair_loss # noqa: F401 from ...tensor.manipulation import reshape -from ...fluid.layers import softmax_with_cross_entropy as fluid_softmax_with_cross_entropy -from ...fluid.layers import square_error_cost # noqa: F401 - -from ...fluid.layers import edit_distance # noqa: F401 -from ...fluid.layers import huber_loss from ...fluid.layer_helper import LayerHelper from ...fluid.framework import _varbase_creator from ...static import Variable @@ -41,6 +33,518 @@ __all__ = [] +def dice_loss(input, label, epsilon=0.00001, name=None): + r""" + + Dice loss for comparing the similarity between the input predictions and the label. + This implementation is for binary classification, where the input is sigmoid + predictions of each pixel, usually used for segmentation task. The dice loss can + be defined as the following equation: + + .. math:: + + dice\_loss &= 1 - \frac{2 * intersection\_area}{total\_area} \\ + &= \frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\ + &= \frac{(union\_area - intersection\_area)}{total\_area} + + + Parameters: + input (Tensor): Tensor, rank>=2, shape is :math:`[N_1, N_2, ..., N_k, D]`, where :math:`N_1` is + the batch_size, :math:`D` is the number of categories. It is usually the output + predictions of sigmoid activation. The data type can be float32 or float64. + label (Tensor): Tensor, the groud truth with the same rank as input, shape is :math:`[N_1, N_2, ..., N_k, 1]`. + where :math:`N_1` is the batch_size. The data type can be int32 or int64. 
+ epsilon (float): The epsilon will be added to the numerator and denominator. + If both input and label are empty, it makes sure dice is 1. + Default: 0.00001 + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor, which shape is [1], data type is the same as `input` . + + Example: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.randn((3,224,224,2)) + label = paddle.randint(high=2, shape=(3,224,224,1)) + predictions = F.softmax(x) + loss = F.dice_loss(input=predictions, label=label) + """ + assert input.dtype in (paddle.float32, paddle.float64) + assert label.dtype in (paddle.int32, paddle.int64) + assert len(input.shape) >= 2, \ + "The rank of input should be greater than or equal to 2." + assert len(input.shape) == len(label.shape), ( + "The rank of input and label should be equal, " + "but received input: %d, label: %d." % + (len(input.shape), len(label.shape))) + assert label.shape[-1] == 1, ("The last dimension of label should be 1, " + "but received %d." % label.shape[-1]) + assert input.shape[:-1] == label.shape[:-1], ( + "All dimensions should be equal except the last one.") + assert input.numel() > 0 and label.numel() > 0, \ + "Any dimension of input and label cannot be equal to 0." + + label = paddle.squeeze(label, [-1]) + label = paddle.nn.functional.one_hot(label, input.shape[-1]) + reduce_dim = list(range(1, len(input.shape))) + inse = paddle.sum(input * label, axis=reduce_dim) + dice_denominator = paddle.sum(input, axis=reduce_dim) + paddle.sum( + label, axis=reduce_dim) + dice_score = 1 - inse * 2 / (dice_denominator + epsilon) + return paddle.mean(dice_score) + + +def log_loss(input, label, epsilon=1e-4, name=None): + r""" + + **Negative Log Loss Layer** + + This layer accepts input predictions and target label and returns the + negative log loss. + + .. math:: + + Out = -label * \log{(input + \epsilon)} + - (1 - label) * \log{(1 - input + \epsilon)} + + Args: + input (Tensor|list): A 2-D tensor with shape [N x 1], where N is the + batch size. This input is a probability computed + by the previous operator. Data type float32. + label (Tensor|list): The ground truth which is a 2-D tensor with + shape [N x 1], where N is the batch size. + Data type float32. + epsilon (float, optional): A small number for numerical stability. Default 1e-4. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + + Returns: + Tensor, which shape is [N x 1], data type is float32. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + label = paddle.randn((10,1)) + prob = paddle.randn((10,1)) + cost = F.log_loss(input=prob, label=label) + """ + if in_dygraph_mode(): + return _C_ops.final_state_log_loss(input, label, epsilon) + + helper = LayerHelper('log_loss', **locals()) + check_variable_and_dtype(input, 'input', ['float32'], 'log_loss') + check_variable_and_dtype(label, 'label', ['float32'], 'log_loss') + + loss = helper.create_variable_for_type_inference(dtype=input.dtype) + + helper.append_op( + type='log_loss', + inputs={'Predicted': [input], + 'Labels': [label]}, + outputs={'Loss': [loss]}, + attrs={'epsilon': epsilon}) + return loss + + +def fluid_softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100, + numeric_stable_mode=True, + return_softmax=False, + axis=-1): + r""" + + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable gradient. + + Because this operator performs a softmax on logits internally, it expects + unscaled logits. This operator should not be used with the output of + softmax operator since that would produce incorrect results. + + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a + single label. + + The equation is as follows: + + 1) Hard label (one-hot label, so every sample has exactly one class) + + .. math:: + + loss_j = -\\text{logits}_{label_j} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K + + 2) Soft label (each sample can have a distribution over all classes) + + .. math:: + + loss_j = -\\sum_{i=0}^{K}\\text{label}_i + \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} + \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + + 3) If :attr:`numeric_stable_mode` is :attr:`True`, softmax is calculated first by: + + .. math:: + + max_j &= \\max_{i=0}^{K}{\\text{logits}_i} + + log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logits_i - max_j) + + softmax_j &= \\exp(logits_j - max_j - {log\\_max\\_sum}_j) + + and then cross entropy loss is calculated by softmax and label. + + Args: + logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. + label (Tensor): The ground truth ``Tensor`` , data type is the same + as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. + soft_label (bool, optional): A flag to indicate whether to interpretant the given + labels as soft labels. Default False. + ignore_index (int, optional): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if :attr:`soft_label` is set to :attr:`False`. + Default: kIgnoreIndex(-100). + numeric_stable_mode (bool, optional): A flag to indicate whether to use a more + numerically stable algorithm. Only valid + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the + algorithm is always numerically stable. 
+ Note that the speed may be slower when use + stable algorithm. Default: True. + return_softmax (bool, optional): A flag indicating whether to return the softmax + along with the cross entropy loss. Default: False. + axis (int, optional): The index of dimension to perform softmax calculations. It + should be in range :math:`[-1, rank - 1]`, while :math:`rank` + is the rank of input :attr:`logits`. Default: -1. + + Returns: + ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), softmax is in the same shape \ + with input logits and cross entropy loss is in \ + the same shape with input logits except shape \ + in dimension :attr:`axis` as 1. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + data = np.random.rand(128).astype("float32") + label = np.random.rand(1).astype("int64") + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + linear = paddle.nn.Linear(128, 100) + x = linear(data) + out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) + print(out) + """ + if _non_static_mode(): + if core.is_compiled_with_npu(): + softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + else: + if in_dygraph_mode(): + softmax, loss = _C_ops.final_state_cross_entropy_with_softmax( + logits, label, soft_label, True, numeric_stable_mode, + ignore_index, axis) + if _in_legacy_dygraph(): + softmax, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + if not return_softmax: + return loss + else: + return loss, softmax + + attrs = { + 'soft_label': soft_label, + 'ignore_index': ignore_index, + 'numeric_stable_mode': numeric_stable_mode, + 'axis': axis + } + helper = LayerHelper('softmax_with_cross_entropy', **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + + outputs = {'Softmax': softmax, 'Loss': loss} + if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): + backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) + outputs['Backprop'] = backprop + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': logits, + 'Label': label}, + outputs=outputs, + attrs=attrs) + + if return_softmax: + return loss, softmax + + return loss + + +def npair_loss(anchor, positive, labels, l2_reg=0.002): + """ + + Npair loss requires paired data. Npair loss has two parts: the first part is L2 + regularizer on the embedding vector; the second part is cross entropy loss which + takes the similarity matrix of anchor and positive as logits. + + For more information, please refer to: + `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ + + Args: + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. + l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. 
+ + + Returns: + A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. + + Examples: + + .. code-block:: python + + import paddle + + DATATYPE = "float32" + + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) + positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) + labels = paddle.rand(shape=(18,), dtype=DATATYPE) + + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) + print(npair_loss) + + """ + check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], + 'npair_loss') + check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], + 'positive') + check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], + 'labels') + Beta = 0.25 + batch_size = labels.shape[0] + + labels = paddle.reshape(labels, shape=[batch_size, 1]) + labels = paddle.tile(labels, repeat_times=[1, batch_size]) + + labels = paddle.equal( + labels, paddle.transpose( + labels, perm=[1, 0])).astype('float32') + labels = labels / paddle.sum(labels, axis=1, keepdim=True) + + l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) \ + + paddle.mean(paddle.sum(paddle.square(positive), 1)) + l2loss = l2loss * Beta * l2_reg + + similarity_matrix = paddle.matmul( + anchor, positive, transpose_x=False, transpose_y=True) + softmax_ce = fluid_softmax_with_cross_entropy( + logits=similarity_matrix, label=labels, soft_label=True) + cross_entropy = paddle.sum(labels * softmax_ce, 0) + celoss = paddle.mean(cross_entropy) + + return l2loss + celoss + + +def square_error_cost(input, label): + r""" + + This op accepts input predictions and target label and returns the + squared error cost. + + For predictions label, and target label, the equation is: + + .. math:: + + Out = (input - label)^2 + + Parameters: + input (Tensor): Input tensor, the data type should be float32. + label (Tensor): Label tensor, the data type should be float32. + + Returns: + The tensor storing the element-wise squared error \ + difference between input and label. + + Return type: Tensor. + + Examples: + + .. code-block:: python + + import paddle + input = paddle.to_tensor([1.1, 1.9]) + label = paddle.to_tensor([1.0, 2.0]) + output = paddle.nn.functional.square_error_cost(input, label) + print(output) + # [0.01, 0.01] + + """ + if _non_static_mode(): + minus_out = _C_ops.elementwise_sub(input, label) + square_out = _C_ops.square(minus_out) + return square_out + + check_variable_and_dtype(input, "input", ['float32', 'float64'], + 'square_error_cost') + check_variable_and_dtype(label, "label", ['float32', 'float64'], + 'square_error_cost') + helper = LayerHelper('square_error_cost', **locals()) + minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='elementwise_sub', + inputs={'X': [input], + 'Y': [label]}, + outputs={'Out': [minus_out]}) + + square_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='square', inputs={'X': [minus_out]}, + outputs={'Out': [square_out]}) + return square_out + + +def edit_distance(input, + label, + normalized=True, + ignored_tokens=None, + input_length=None, + label_length=None): + """ + This op computes the edit distances, also called Levenshtein distance, between a batch of + hypothesis strings and their references. It measures how dissimilar two strings are by counting + the minimum number of operations to transform one string into another. + The operations include insertion, deletion, and substitution. 
+
+ For example, given hypothesis string A = "kitten" and reference
+ B = "sitting", A will be transformed into B
+ at least after two substitutions and one insertion:
+
+ "kitten" -> "sitten" -> "sittin" -> "sitting"
+
+ So the edit distance between A and B is 3.
+
+ The input is a Tensor, and input_length and label_length should be provided to indicate the valid length of each sequence.
+
+ The `batch_size` of `label` should be the same as that of `input`.
+
+ The output includes the edit distance between every pair of input and its related label, and the number of sequences.
+ If Attr(normalized) is true,
+ the edit distance value will be divided by the length of label.
+
+ Parameters:
+ input(Tensor): The input tensor, its rank should be equal to 2 and its data type should be int64.
+ label(Tensor): The label tensor, its rank should be equal to 2 and its data type should be int64.
+ normalized(bool, default True): Indicates whether to normalize the edit distance by the length of the label.
+ ignored_tokens(list, default None): Tokens that will be removed before
+ calculating edit distance.
+ input_length(Tensor): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64.
+ label_length(Tensor): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64.
+ NOTE: To avoid unexpected results, the value of every element in input_length and label_length should be equal to the value of the second dimension of input and label respectively. For example, given the input [[1,2,3,4],[5,6,7,8],[9,10,11,12]] whose shape is [3,4], the input_length should be [4,4,4].
+ NOTE: This API is different from fluid.metrics.EditDistance.
+
+ Returns:
+ Tuple:
+
+ distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1).
+ sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,).
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import paddle.nn.functional as F
+
+ input = paddle.to_tensor([[1,2,3],[4,5,6],[4,4,4],[1,1,1]], dtype='int64')
+ label = paddle.to_tensor([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]], dtype='int64')
+ input_len = paddle.to_tensor([3,3,3,3], dtype='int64')
+ label_len = paddle.to_tensor([4,4,4,4], dtype='int64')
+
+ distance, sequence_num = F.loss.edit_distance(input=input, label=label, input_length=input_len, label_length=label_len, normalized=False)
+
+ # print(distance)
+ # [[3.]
+ # [2.]
+ # [4.]
+ # [1.]]
+ # if set normalized to True
+ # [[0.75]
+ # [0.5 ]
+ # [1.
] + # [0.25] + # + # print(sequence_num) + # [4] + + """ + check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') + check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') + helper = LayerHelper("edit_distance", **locals()) + + # remove some tokens from input and labels + if ignored_tokens is not None and len(ignored_tokens) > 0: + erased_input = helper.create_variable_for_type_inference(dtype="int64") + erased_label = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="sequence_erase", + inputs={"X": [input]}, + outputs={"Out": [erased_input]}, + attrs={"tokens": ignored_tokens}) + input = erased_input + + helper.append_op( + type="sequence_erase", + inputs={"X": [label]}, + outputs={"Out": [erased_label]}, + attrs={"tokens": ignored_tokens}) + label = erased_label + + this_inputs = {"Hyps": [input], "Refs": [label]} + if input_length is not None and label_length is not None: + this_inputs['HypsLength'] = [input_length] + this_inputs['RefsLength'] = [label_length] + + # edit distance op + edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") + sequence_num = helper.create_variable_for_type_inference(dtype="int64") + helper.append_op( + type="edit_distance", + inputs=this_inputs, + outputs={"Out": [edit_distance_out], + "SequenceNum": [sequence_num]}, + attrs={"normalized": normalized}) + + return edit_distance_out, sequence_num + + def binary_cross_entropy(input, label, weight=None, reduction='mean', name=None): """ @@ -138,10 +642,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', else: return out else: - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'binary_cross_entropy') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'binary_cross_entropy') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'binary_cross_entropy') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'binary_cross_entropy') sub_name = name if weight is None and reduction == 'none' else None helper = LayerHelper("binary_cross_entropy", name=sub_name) @@ -288,12 +792,10 @@ def binary_cross_entropy_with_logits(logit, else: return out - fluid.data_feeder.check_variable_and_dtype( - logit, 'logit', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') sigmoid_name = None if reduction == 'none' and pos_weight is None and weight is None: sigmoid_name = name @@ -303,18 +805,17 @@ def binary_cross_entropy_with_logits(logit, one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype) if pos_weight is not None: - fluid.data_feeder.check_variable_and_dtype( - pos_weight, 'pos_weight', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(pos_weight, 'pos_weight', + ['float32', 'float64'], + 'binary_cross_entropy_with_logits') log_weight = paddle.add( paddle.multiply(label, paddle.subtract(pos_weight, one)), one) pos_weight_name = name if reduction == 'none' and weight is None else None out = paddle.multiply(out, log_weight, name=pos_weight_name) if weight is not None: - fluid.data_feeder.check_variable_and_dtype( - weight, 'weight', 
['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') weight_name = name if reduction == 'none' else None out = paddle.multiply(out, weight, name=weight_name) @@ -519,12 +1020,26 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): output = paddle.nn.functional.smooth_l1_loss(input, label) print(output) """ - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'smooth_l1_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'smooth_l1_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'smooth_l1_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'smooth_l1_loss') - out = huber_loss(input=input, label=label, delta=delta) + if in_dygraph_mode(): + out, residual = _C_ops.final_state_huber_loss(input, label, delta) + else: + helper = LayerHelper('huber_loss', **locals()) + residual = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='huber_loss', + inputs={'X': input, + 'Y': label}, + outputs={'Out': out, + 'Residual': residual}, + attrs={'delta': delta}) if reduction not in ['sum', 'mean', 'none']: raise ValueError( @@ -615,12 +1130,12 @@ def margin_ranking_loss(input, return out helper = LayerHelper("margin_ranking_loss", **locals()) - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'margin_rank_loss') - fluid.data_feeder.check_variable_and_dtype( - other, 'other', ['float32', 'float64'], 'margin_rank_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'margin_rank_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'margin_rank_loss') + check_variable_and_dtype(other, 'other', ['float32', 'float64'], + 'margin_rank_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'margin_rank_loss') out = paddle.subtract(other, input) out = paddle.multiply(out, label) @@ -738,9 +1253,9 @@ def l1_loss(input, label, reduction='mean', name=None): else: return unreduced - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype( input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss') - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype( label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss') if reduction == 'sum': @@ -847,10 +1362,8 @@ def nll_loss(input, label = reshape(label, shape=[n, 1, -1]) out_shape = [n] + input_shape[2:] - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'nll_loss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], - 'nll_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nll_loss') + check_variable_and_dtype(label, 'label', ['int64'], 'nll_loss') inputs = {'X': input, 'Label': label} attrs = {'reduction': reduction, 'ignore_index': ignore_index} if weight is not None: @@ -971,10 +1484,8 @@ def kl_div(input, label, reduction='mean', name=None): helper = LayerHelper('kl_div', **locals()) - fluid.data_feeder.check_variable_and_dtype(input, 'input', - ['float32', 'float64'], 'kl_div') - fluid.data_feeder.check_variable_and_dtype(label, 'label', - ['float32', 'float64'], 'kl_div') + check_variable_and_dtype(input, 'input', 
['float32', 'float64'], 'kl_div') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'kl_div') fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div') loss = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -1051,10 +1562,10 @@ def mse_loss(input, label, reduction='mean', name=None): "but received {}.".format(reduction)) if not in_dynamic_mode(): - paddle.fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'mse_loss') - paddle.fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'mse_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'mse_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'mse_loss') if reduction == 'none': return paddle.square(paddle.subtract(input, label), name=name) @@ -1858,9 +2369,9 @@ def cross_entropy(input, out = paddle.squeeze(out, axis=axis) return out - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'softmax_cross_entropy') - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'softmax_cross_entropy') + check_variable_and_dtype( label, 'label', ['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'], 'softmax_cross_entropy') @@ -1887,8 +2398,8 @@ def cross_entropy(input, attrs=attrs) if weight is not None: - fluid.data_feeder.check_variable_and_dtype( - weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'softmax_cross_entropy') weight_name = name if reduction == 'none' else None if soft_label == True: # chajchaj: @@ -2050,9 +2561,8 @@ def sigmoid_focal_loss(logit, % reduction) if normalizer is not None: - fluid.data_feeder.check_variable_and_dtype(normalizer, 'normalizer', - ['float32', 'float64'], - 'sigmoid_focal_loss') + check_variable_and_dtype(normalizer, 'normalizer', + ['float32', 'float64'], 'sigmoid_focal_loss') normalizer_shape = list(normalizer.shape) normalizer_dims = len(normalizer_shape) if normalizer_dims > 1: @@ -2102,10 +2612,10 @@ def sigmoid_focal_loss(logit, return loss - fluid.data_feeder.check_variable_and_dtype( - logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss') + check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], + 'sigmoid_focal_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'sigmoid_focal_loss') bce_name = None if reduction == 'none' and normalizer is None: diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 6fee5058057cb..2d0cd77ee17e9 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -60,19 +60,19 @@ def __init__(self, mean=0.0, std=1.0, name=None): class TruncatedNormal(TruncatedNormalInitializer): - """The Random TruncatedNormal (Gaussian) distribution initializer. + """The truncated normal distribution (Gaussian distribution) initializer. Args: - mean (float, optional): mean of the normal distribution. The default value is 0.0. - std (float, optional): standard deviation of the normal distribution. The default value is 1.0. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. 
+ mean (float, optional): Mean of the normal distribution. The default value is :math:`0.0`. + std (float, optional): Standard deviation of the normal distribution. The default value is :math:`1.0`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - A parameter initialized by Random TruncatedNormal (Gaussian) distribution. + A parameter initialized by truncated normal distribution (Gaussian distribution). Examples: .. code-block:: python + :name: initializer_TruncatedNormal-example import paddle diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index cac03b5948071..f07883adbb0ae 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -18,19 +18,19 @@ class Uniform(UniformInitializer): - """The random uniform distribution initializer. + """The uniform distribution initializer. Args: - low (float, optional): lower boundary of the uniform distribution. The default value is -1.0. - high (float, optional): upper boundary of the uniform distribution. The default value is 1.0. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + low (float, optional): Lower boundary of the uniform distribution. The default value is :math:`-1.0`. + high (float, optional): Upper boundary of the uniform distribution. The default value is :math:`1.0`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - A parameter initialized by random uniform distribution. + A parameter initialized by uniform distribution. Examples: .. code-block:: python + :name: initializer_Uniform-example import paddle diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 31364f0281c8a..cca8c37645df6 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -26,6 +26,7 @@ from .activation import Sigmoid # noqa: F401 from .activation import Softmax # noqa: F401 from .activation import LogSoftmax # noqa: F401 +from .activation import RReLU # noqa: F401 from .activation import Softmax2D # noqa: F401 from .common import Bilinear # noqa: F401 from .common import Pad1D # noqa: F401 diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 7fd109843bede..1a3768e919042 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -436,6 +436,93 @@ def extra_repr(self): name_str) +class RReLU(Layer): + r""" + RReLU activation layer. + + Applies the randomized leaky rectified liner unit function to improve generalization performance, + as described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_ + + During training, randomly samples the negative slope for activation values as described below: + + .. math:: + + RReLU(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + a * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`a` is randomly sampled from uniform distribution in range (:math:`lower`, :math:`upper`), + + In the test phase, the negative slope will take the average value of :math:`lower` and :math:`upper`: + + .. 
math:: + + RReLU(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + (lower + upper) * 0.5 * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`lower` and :math:`upper` are the bounds of uniform distribution. + + Parameters: + lower (float, optional): The lower bound of uniform distribution. Default: 0.125. + upper (float, optional): The upper bound of uniform distribution. Default: 0.333. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. Default dtype is float32. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + :name: RReLU-example + + import paddle + + input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + + rrelu_layer = paddle.nn.RReLU(0.1, 0.3) + output = rrelu_layer(input_tensor) + #[[[[-0.20000899 3. -0.88108218 5. ] + # [ 3. -0.55175185 5. -1.07761011] + # [-1.06806871 -1.98962009 8. 9. ]] + # [[ 1. -0.52382672 -0.65515128 4. ] + # [-1.37663394 6. 7. -2.34657836] + # [ 6. 7. 8. 9. ]]]] + """ + + def __init__(self, lower=1. / 8., upper=1. / 3., name=None): + super(RReLU, self).__init__() + self._lower = lower + self._upper = upper + self._name = name + + def forward(self, x): + return F.rrelu( + x, lower=self._lower, upper=self._upper, training=self.training) + + def extra_repr(self): + name_str = ', name={}'.format(self._name) if self._name else '' + return 'lower={}, upper={}, training={}, dtype={}{}'.format( + self._lower, self._upper, self.training, self._dtype, name_str) + + class ReLU(Layer): """ ReLU Activation. diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 7c3e3ad8dee9f..6cdfc36d5d61f 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -32,7 +32,7 @@ from ...fluid.dygraph import BatchNorm # noqa: F401 from ...fluid.dygraph import SpectralNorm # noqa: F401 -from ...framework import get_default_dtype, set_default_dtype +from ...framework import get_default_dtype, set_default_dtype, _non_static_mode from ..initializer import Constant from ...framework import ParamAttr @@ -404,6 +404,25 @@ def __init__(self, self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. 
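# A minimal usage sketch of the dygraph fast path that the hunk below adds to
# paddle.nn.GroupNorm.forward (in dynamic graph mode it dispatches to _C_ops.group_norm
# directly instead of appending a static-graph op). The shapes and group count here are
# arbitrary illustrative choices, not taken from the source.
import paddle

x = paddle.rand((2, 6, 4, 4))  # [N, C, H, W]; C must be divisible by num_groups
group_norm = paddle.nn.GroupNorm(num_groups=3, num_channels=6)
y = group_norm(x)  # under dynamic graph mode this exercises the _non_static_mode() branch below
print(y.shape)  # [2, 6, 4, 4]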
def forward(self, input): + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + + if _non_static_mode(): + pre_act, _, _ = _C_ops.group_norm( + input, + self.weight, + self.bias, + mean_out, + variance_out, + 'epsilon', + self._epsilon, + 'groups', + self._num_groups, ) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=None) + inputs = {'X': input} if self.bias is not None: inputs['Bias'] = self.bias @@ -411,10 +430,6 @@ def forward(self, input): inputs['Scale'] = self.weight # create output - mean_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) group_norm_out = self._helper.create_variable_for_type_inference( dtype=input.dtype) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0fa49745a95fb..0b61f3cb9a787 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -12,11 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings +from collections import defaultdict from .optimizer import Optimizer -from .adam import Adam +from .lr import LRScheduler from ..fluid import core from ..fluid import framework -from ..fluid.framework import Variable +from ..fluid.framework import Variable, Parameter +from ..fluid import unique_name +from ..fluid import layers +from ..fluid.layer_helper import LayerHelper +from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base from collections.abc import Callable from .. import _C_ops @@ -25,7 +31,7 @@ __all__ = [] -class AdamW(Adam): +class AdamW(Optimizer): r""" The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. @@ -102,14 +108,14 @@ class AdamW(Adam): beta1 = paddle.to_tensor([0.9], dtype="float32") beta2 = paddle.to_tensor([0.99], dtype="float32") - adam = paddle.optimizer.AdamW(learning_rate=0.1, + opt = paddle.optimizer.AdamW(learning_rate=0.1, parameters=linear.parameters(), beta1=beta1, beta2=beta2, weight_decay=0.01) out.backward() - adam.step() - adam.clear_grad() + opt.step() + opt.clear_grad() #Note that the learning_rate of linear_2 is 0.01. 
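With AdamW now inheriting from Optimizer directly, the decoupled weight decay is handed to the adamw kernel as coeff instead of being emitted as the separate scale-and-assign pass that _append_decoupled_weight_decay used to build. As a reminder of the arithmetic, a minimal NumPy sketch of one decoupled update follows; the function name, the simplified bias correction, and where lr_ratio enters are assumptions made for illustration, not Paddle's kernel code.

```python
import numpy as np

def adamw_update(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999,
                 epsilon=1e-8, weight_decay=0.01, lr_ratio=1.0):
    """One decoupled AdamW step: decay acts on the parameter itself,
    not on the gradient as plain L2 regularization would."""
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad * grad
    m_hat = m / (1.0 - beta1 ** t)
    v_hat = v / (1.0 - beta2 ** t)
    step_lr = lr * lr_ratio
    param = param * (1.0 - step_lr * weight_decay)               # decoupled decay
    param = param - step_lr * m_hat / (np.sqrt(v_hat) + epsilon)
    return param, m, v

p, g = np.ones(4), np.full(4, 0.5)
m, v = np.zeros(4), np.zeros(4)
for t in range(1, 4):
    p, m, v = adamw_update(p, g, m, v, t)
print(p)
```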
@@ -119,7 +125,7 @@ class AdamW(Adam): out = linear_1(inp) out = linear_2(out) loss = paddle.mean(out) - adam = paddle.optimizer.AdamW( + opt = paddle.optimizer.AdamW( learning_rate=0.1, parameters=[{ 'params': linear_1.parameters() @@ -132,11 +138,16 @@ class AdamW(Adam): weight_decay=0.01, beta1=0.9) out.backward() - adam.step() - adam.clear_grad() + opt.step() + opt.clear_grad() """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + def __init__(self, learning_rate=0.001, beta1=0.9, @@ -160,37 +171,108 @@ def __init__(self, raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") if not 0 <= epsilon: raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") - coeff = weight_decay - if not isinstance(coeff, float) and \ - not isinstance(coeff, framework.Variable): - raise TypeError("coeff should be float or Tensor.") - self._params_name = set() - self._apply_decay_param_fun = apply_decay_param_fun - self._coeff = coeff - self._lr_to_coeff = dict() + if not isinstance(weight_decay, float) and \ + not isinstance(weight_decay, framework.Variable): + raise TypeError("weight_decay should be float or Tensor.") if lr_ratio is not None: assert isinstance(lr_ratio, Callable) if not core.is_compiled_with_cuda(): raise NotImplementedError( "'lr_ratio' is unimplemented in CPU, XPU and NPU") - self._lr_ratio = lr_ratio - super(AdamW, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=grad_clip, - name=name, - lazy_mode=lazy_mode, - multi_precision=multi_precision) - self._default_dict = {'coeff': coeff} + if parameters is not None: + # paddle.Tensor is also iterable, so here we don't check whether + # the input is iterable, if the input is paddle.Tensor, the + # list(paddle.Tensor) will be a error value + if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)): + raise TypeError( + "`parameters` argument given to the optimizer should be " + "an iterable of paddle Tensors, but got argument type is `{}`.". + format(type(parameters))) + if isinstance(parameters, dict): + raise TypeError( + "`parameters` argument should not get dict type, " + "if parameter groups is needed, please set `parameters`" + " as list of dict") + self._parameter_list = list(parameters) + else: + self._parameter_list = None + + self._name = name + if framework._non_static_mode(): + if self._parameter_list is None: + raise AttributeError( + "parameters argument given to the Optimizer should not be None in dygraph mode." + ) + + if not isinstance(learning_rate, (float, LRScheduler)): + raise TypeError( + "learning rate should be float or LRScheduler, got %s here" % + type(learning_rate)) + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + + self._dtype = None + # Infer the dtype form parameter + if self._parameter_list: + if isinstance(self._parameter_list[0], dict): + for param_group in self._parameter_list: + assert 'params' in param_group, \ + 'params should be set in parameters if parameter groups are optimized in different options' + self._dtype = self._parameter_list[0]['params'][0].dtype + else: + self._dtype = self._parameter_list[0].dtype + + # each program should have a independent learning rate + # program -> tensor(learning_rate) + self._learning_rate_map = dict() + # Dictionary of accumulators. 
Some optimizer subclasses need to + # allocate and manage extra tensors associated with the parameters + # to train. These tensors are called accumulators. + # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} + self._accumulators = defaultdict(lambda: dict()) + self.helper = None + self._opti_name_list = [] + self._accumulators_holder = {} + self._param_device_map = dict() + self.clear_gradients = self.clear_grad self.type = "adamw" + self._learning_rate = learning_rate + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._weight_decay = weight_decay + self._grad_clip = grad_clip + self._lr_ratio = lr_ratio + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + self._lazy_mode = lazy_mode + self._multi_precision = multi_precision + self._master_weights = {} + + self._default_dict = { + 'weight_decay': weight_decay, + 'beta1': beta1, + 'beta2': beta2, + 'epsilon': epsilon, + 'lazy_mode': lazy_mode, + 'grad_clip': grad_clip + } + + self._param_groups = [] + if self._parameter_list and isinstance(self._parameter_list[0], dict): + for param_group in self._parameter_list: + self._add_param_group(param_group.copy()) + else: + self._param_groups = self._parameter_list - # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. - self._auxiliary_vars = dict() + self._use_multi_tensor = None + self.regularization = None + self._auxiliary_vars = {} def _set_auxiliary_var(self, key, val): self._auxiliary_vars[key] = val @@ -201,58 +283,128 @@ def _get_auxiliary_var(self, key): else: return None - def _append_decoupled_weight_decay(self, block, param_and_grad): + def _add_param_group(self, param_group): """ - Add decoupled weight decay op. - parameter = parameter - parameter * coeff * lr + Add a param group to parameter_list. + Args: - block: block in which variable is to be created - param_and_grad: (parameters, gradients) pairs, - the parameters need to decay. - Raises: - Exception: The type of coeff and parameter is not consistent. + param_group (dict): The group of Tensors to be optimzed with + different optimization options. """ - if isinstance(param_and_grad, dict): - param_and_grad = self._update_param_group(param_and_grad) - param, grad = param_and_grad + params = param_group['params'] + if isinstance(params, Parameter): + param_group['params'] = [params] + elif isinstance(params, set): + raise TypeError( + "optimizer parameters should be in ordered collections," + "but received set, please use list instead.") + else: + param_group['params'] = list(params) - if self._apply_decay_param_fun is not None \ - and not self._apply_decay_param_fun(param.name): - return + # Update optimization options for each groups + for k, v in self._default_dict.items(): + param_group.setdefault(k, v) + + param_set = set() + for group in self._param_groups: + param_set.update(set(group['params'])) + + if not param_set.isdisjoint(set(param_group['params'])): + raise ValueError( + "some parameters appear in more than one parameter group") - if isinstance(self._learning_rate, float): - learning_rate = self._learning_rate + for param in param_group['params']: + param.optimize_attr['learning_rate'] = param_group.get( + 'learning_rate', 1.) + + self._param_groups.append(param_group) + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] else: - # NOTE. 
We add this function to the _append_optimize_op(), - # for we must make sure _create_param_lr() be called after - # optimizer._create_global_learning_rate(). - learning_rate = self._create_param_lr(param_and_grad) - - with block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - self._params_name.add(param.name) - - # If it has been calculated, the result will be reused. - # NOTE(wangxi): In dygraph mode, apply_gradient will be executed - # every step, so need clear _lr_to_coeff every step, - # we do this in _create_optimization_pass - decay_coeff = self._lr_to_coeff.get(learning_rate, None) - if decay_coeff is None: - # NOTE(wangxi): for pipeline to set device:all - with paddle.static.device_guard(None): - decay_coeff = 1.0 - learning_rate * self._coeff - self._lr_to_coeff[learning_rate] = decay_coeff - - find_master = (self._multi_precision and - param.dtype == core.VarDesc.VarType.FP16) - if find_master: - master_weight = self._master_weights[param.name] - scaled_param = master_weight * decay_coeff - paddle.fluid.layers.assign( - input=scaled_param, output=master_weight) - else: - scaled_param = param * decay_coeff - paddle.fluid.layers.assign(input=scaled_param, output=param) + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + target_param = self._master_weights[ + param.name] if find_master else param + target_name = target_param.name + if (name not in self._accumulators or + target_name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". 
+ format(name, target_name)) + return self._accumulators[name][target_name] + + def _add_moments_pows(self, p): + acc_dtype = p.dtype + if acc_dtype == core.VarDesc.VarType.FP16: + acc_dtype = core.VarDesc.VarType.FP32 + self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) + self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype=acc_dtype, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + dtype=acc_dtype, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + if isinstance(parameters, dict): + parameters = self._update_param_group(parameters) + + # Create accumulator tensors for first and second moments + for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_moments_pows(master_p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Adam optimizer." + ) + self._add_moments_pows(p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -295,8 +447,9 @@ def _append_optimize_op(self, block, param_and_grad): _, _, _, _, _, _ = _C_ops.final_state_adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, - _beta1, _beta2, self._epsilon, lr_ratio_, self._coeff, - with_decay, self._lazy_mode, 1000, find_master, False) + _beta1, _beta2, self._epsilon, lr_ratio_, + self._weight_decay, with_decay, self._lazy_mode, 1000, + find_master, False) else: _, _, _, _, _, _ = _C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, @@ -306,8 +459,8 @@ def _append_optimize_op(self, block, param_and_grad): 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, 'beta2', _beta2, "with_decay", with_decay, 'coeff', - self._coeff, 'multi_precision', find_master, 'lr_ratio', - lr_ratio_) + self._weight_decay, 'multi_precision', find_master, + 'lr_ratio', lr_ratio_) return None inputs = { @@ -338,7 +491,7 @@ def _append_optimize_op(self, block, param_and_grad): "min_row_size_to_use_multithread": 1000, "multi_precision": find_master, "with_decay": with_decay, - "coeff": self._coeff, + "coeff": self._weight_decay, "lr_ratio": 1. if self._lr_ratio is None else self._lr_ratio(param_and_grad[0]) } @@ -369,17 +522,96 @@ def _append_optimize_op(self, block, param_and_grad): return adamw_op - def _create_optimization_pass(self, parameters_and_grads): - optimize_ops = super( - AdamW, self)._create_optimization_pass(parameters_and_grads) - # In dygraph mode, clear _lr_to_coeff after applied gradient - self._lr_to_coeff = dict() - return optimize_ops - def __str__(self): return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) + @imperative_base.no_grad + @framework.dygraph_only + def step(self): + """ + Execute the optimizer and update parameters once. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + + a = paddle.rand([2,13], dtype="float32") + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + opt = paddle.optimizer.AdamW(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + opt.step() + opt.clear_grad() + """ + if not isinstance(self._parameter_list[0], dict): + params_grads = [] + for param in self._parameter_list: + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + if framework.in_dygraph_mode(): + if hasattr(grad_var, "is_selected_rows" + ) and grad_var.is_selected_rows( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if hasattr(grad_var, + "_is_sparse") and grad_var._is_sparse( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + params_grads.append((param, grad_var)) + + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + else: + # optimize parameters in groups + for param_group in self._param_groups: + params_grads = defaultdict(lambda: list()) + for param in param_group['params']: + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + if framework.in_dygraph_mode(): + if hasattr(grad_var, "is_selected_rows" + ) and grad_var.is_selected_rows( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if hasattr(grad_var, + "_is_sparse") and grad_var._is_sparse( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + params_grads['params'].append((param, grad_var)) + params_grads.update( + {k: v + for k, v in param_group.items() if k != 'params'}) + self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + def _update_param_group(self, parameters): - self._coeff = parameters.get('coeff', self._default_dict['coeff']) + self._beta1 = parameters.get('beta1', self._default_dict['beta1']) + self._beta2 = parameters.get('beta2', self._default_dict['beta2']) + self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) + self._lazy_mode = parameters.get('lazy_mode', + self._default_dict['lazy_mode']) + self._weight_decay = parameters.get('weight_decay', + self._default_dict['weight_decay']) parameters = parameters.get('params') + return parameters diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 9dfec3947e95f..cf180fccc4857 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -58,6 +58,8 @@ def append_backward_new(loss_list, program = default_main_program() assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block." 
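The step() method above takes one of two paths: a flat parameter list goes straight to _apply_optimize, while parameter groups are processed one group at a time with the per-group options merged over _default_dict by _add_param_group. The following self-contained sketch mirrors that merge and the duplicate-parameter check using plain dictionaries; the parameter names are hypothetical.

```python
defaults = {"weight_decay": 0.01, "beta1": 0.9, "beta2": 0.999,
            "epsilon": 1e-8, "lazy_mode": False, "grad_clip": None}

param_groups = [
    {"params": ["linear_1.weight", "linear_1.bias"]},              # inherits all defaults
    {"params": ["linear_2.weight", "linear_2.bias"],
     "weight_decay": 0.0, "learning_rate": 0.1, "beta1": 0.8},     # per-group overrides
]

merged, seen = [], set()
for group in param_groups:
    group = dict(group)
    for key, value in defaults.items():
        group.setdefault(key, value)            # same idea as _add_param_group's setdefault loop
    if not seen.isdisjoint(group["params"]):    # each parameter may belong to only one group
        raise ValueError("some parameters appear in more than one parameter group")
    seen.update(group["params"])
    merged.append(group)

for group in merged:
    print(group["params"], group["weight_decay"], group["beta1"])
```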
block = program.current_block() + for el in loss_list: + assert el.block == block, f'variable in loss_list should be in current block of main program' orig2prim(block) ad = Transform(block) diff --git a/python/paddle/static/sparsity/__init__.py b/python/paddle/static/sparsity/__init__.py index 59f794ef28aa4..b4543b8d000fc 100644 --- a/python/paddle/static/sparsity/__init__.py +++ b/python/paddle/static/sparsity/__init__.py @@ -16,8 +16,14 @@ from ...fluid.contrib.sparsity import calculate_density #noqa: F401 from ...fluid.contrib.sparsity import decorate #noqa: F401 from ...fluid.contrib.sparsity import prune_model #noqa: F401 -from ...fluid.contrib.sparsity import set_excluded_layers #noqa: F401 from ...fluid.contrib.sparsity import reset_excluded_layers #noqa: F401 +from ...fluid.contrib import sparsity #noqa: F401 + + +def set_excluded_layers(main_program, param_names): + sparsity.set_excluded_layers( + param_names=param_names, main_program=main_program) + __all__ = [ #noqa 'calculate_density', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 283bce1cc817f..478f4b6351fbf 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -263,6 +263,7 @@ from .stat import var # noqa: F401 from .stat import numel # noqa: F401 from .stat import median # noqa: F401 +from .stat import nanmedian # noqa: F401 from .stat import quantile # noqa: F401 from .stat import nanquantile # noqa: F401 @@ -448,6 +449,7 @@ 'var', 'numel', 'median', + 'nanmedian', 'quantile', 'nanquantile', 'is_complex', diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index c7e73cec47bea..e37ca981f851c 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1510,12 +1510,14 @@ def assign(x, output=None): # isinstance(VarBase, Variable) == False. It will cause return None # after this api. if isinstance(input, (Variable, core.VarBase)): - if _non_static_mode(): + if in_dygraph_mode(): + if output is None: + output = _C_ops.final_state_assign(input) + else: + _C_ops.final_state_assign_out_(input, output) + elif _in_legacy_dygraph(): if output is None: - if _in_legacy_dygraph(): - output = core.VarBase() - else: - output = core.eager.Tensor() + output = core.VarBase() _C_ops.assign(input, output) else: check_dtype(input.dtype, 'input', [ @@ -1566,16 +1568,21 @@ def assign(x, output=None): if output is None: output = helper.create_variable_for_type_inference( dtype=input.dtype) - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) - - if is_inplace and _non_static_mode(): + if _non_static_mode(): + _C_ops.assign_value(output, 'shape', + list(input.shape), 'dtype', dtype, value_name, + values) + else: + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) + + if is_inplace and _in_legacy_dygraph(): output._bump_inplace_version() return output diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 713a611f9f39a..49cc426a00fd9 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -798,11 +798,12 @@ def gen_einsum_op(equation, *operands): """ assert len(operands) <= 2, "Only support two operands in EinsumOp." 
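The assign change above splits the dygraph path: with no output argument a new tensor comes back from final_state_assign, while an explicit output is filled in place via final_state_assign_out_. A short usage sketch of the public paddle.assign API under that assumption (dygraph mode, Paddle installed):

```python
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])

# No `output`: a fresh tensor is returned (final_state_assign path in new dygraph).
y = paddle.assign(x)

# Explicit `output`: the values are copied into the existing tensor
# (final_state_assign_out_ path in new dygraph).
z = paddle.zeros([3])
paddle.assign(x, output=z)

print(y.numpy())  # [1. 2. 3.]
print(z.numpy())  # [1. 2. 3.]
```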
if in_dygraph_mode(): - return _C_ops.final_state_einsum(operands, equation) + return _C_ops.final_state_einsum(operands, equation)[0] if _in_legacy_dygraph(): # dygraph - return _C_ops.einsum(operands, 'equation', equation) + return _C_ops.einsum(operands, len(operands), 'equation', equation)[0] + # static graph for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') @@ -811,11 +812,16 @@ def gen_einsum_op(equation, *operands): out = helper.create_variable_for_type_inference(dtype=operands[0].dtype) attrs = dict() attrs['equation'] = equation + caches = [ + helper.create_variable_for_type_inference(dtype=operands[0].dtype) + for i in range(len(operands)) + ] helper.append_op( type='einsum', inputs={'Operands': operands}, - outputs={'Out': out}, - attrs=attrs, ) + outputs={'Out': out, + "InnerCache": caches}, + attrs=attrs) return out @@ -977,7 +983,7 @@ def einsum(equation, *operands): # [0.51476848, 0.23367381, 0.39229113]]]) """ import os - if int(os.environ.get('FLAGS_new_einsum', "0")): + if int(os.environ.get('FLAGS_new_einsum', "1")): return einsum_v2(equation, *operands) nop = len(operands) diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index ecb13613a125e..72e5eb640125d 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -21,7 +21,7 @@ from six.moves import cStringIO from ..static import Variable from ..fluid.proto import framework_pb2 -from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_ +from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype import paddle @@ -256,7 +256,13 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): - if paddle.in_dynamic_mode(): + final_state_op_type = "final_state_%s" % op_type + if in_dygraph_mode() and hasattr(_C_ops, final_state_op_type): + op = getattr(_C_ops, final_state_op_type) + return op(x) + # TODO(dev): Because some ops' yaml has not been migrated. + # Replace it with _in_legacy_dygraph while all yaml work is done. 
+ if _non_static_mode(): op = getattr(_C_ops, op_type) return op(x) @@ -265,9 +271,10 @@ def func(x, name=None): op_type) else: # abs exp square ops support dtype(int32, int64, float16, float32, float64) - check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + check_variable_and_dtype(x, 'x', [ + 'int32', 'int64', 'float16', 'float32', 'float64', 'complex64', + 'complex128' + ], op_type) helper = LayerHelper(op_type, **locals()) @@ -296,7 +303,7 @@ def generate_inplace_fn(inplace_op_type): origin_op_type = inplace_op_type[:-1] def func(x, name=None): - if paddle.in_dynamic_mode(): + if _non_static_mode(): op = getattr(_C_ops, inplace_op_type) return op(x) warnings.warn( diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 973f870d581cd..57785c16e60bb 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -3169,7 +3169,7 @@ def reshape(x, shape, name=None): item.numpy().item(0) if isinstance(item, Variable) else item for item in shape ] - out, _ = _C_ops.reshape2(x, None, 'shape', shape) + out = _C_ops.final_state_reshape(x, shape) elif isinstance(shape, tmp_tensor_type): shape.stop_gradient = True out, _ = _C_ops.reshape2(x, shape) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 0be79ece01ff9..2ef324395b26a 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1610,20 +1610,24 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): input_shape = input.shape x_shape = x.shape y_shape = y.shape - if not len(input_shape) == len(x_shape) == len(y_shape) == 2: - raise ValueError("The dimention of input, x, y should be 2 but receive input's shape: {}, x's shape: {}, y's shape: {}".format(input_shape, x_shape, y_shape)) - if input_shape[0] != x_shape[0]: - if input_shape[0] != 1: - raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0])) - if input_shape[1] != y_shape[1] and input_shape[1] != 1: - raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) - if input_shape[1] != y_shape[1]: - if input_shape[1] != 1: - raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) - if input_shape[0] != x_shape[0] and input_shape[0] != 1: - raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0])) + if not len(x_shape) == len(y_shape) == 2: + raise ValueError("The dimention of x, y should be 2 but receive x's shape: {}, y's shape: {}".format(x_shape, y_shape)) if x_shape[1] != y_shape[0]: raise ValueError("The input Variable x's width must be equal with Variable y' height. 
But received x's shape = {}, y's shape = {}.".format(x_shape, y_shape)) + if len(input_shape) == 2: + if input_shape[0] != x_shape[0]: + if input_shape[0] != 1: + raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0])) + if input_shape[1] != y_shape[1] and input_shape[1] != 1: + raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) + if input_shape[1] != y_shape[1]: + if input_shape[1] != 1: + raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) + elif len(input_shape) == 1: + if input_shape[0] not in (y_shape[1], 1): + raise ValueError("The input's shape: {} is not broadcastable with [x.shape[0], y.shape[1]]: [{},{}]".format(input_shape, x_shape[0], y_shape[1])) + else: + raise ValueError("The dimention of input should be 2 or 1 but receive input's shape: {}".format(input_shape)) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 1194d81a360db..49671d65b6d44 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -30,7 +30,7 @@ def bernoulli(x, name=None): """ - This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution. + Returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution. The input ``x`` is a tensor with probabilities for generating the random binary number. Each element in ``x`` should be in [0, 1], and the out is generated by: @@ -86,7 +86,7 @@ def bernoulli(x, name=None): def poisson(x, name=None): r""" - This OP returns a tensor filled with random number from a Poisson Distribution. + Returns a tensor filled with random number from a Poisson Distribution. .. math:: @@ -129,7 +129,7 @@ def poisson(x, name=None): def multinomial(x, num_samples=1, replacement=False, name=None): """ - This OP returns a Tensor filled with random values sampled from a Multinomical + Returns a Tensor filled with random values sampled from a Multinomical distribution. The input ``x`` is a tensor with probabilities for generating the random number. Each element in ``x`` should be larger or equal to 0, but not all 0. ``replacement`` indicates whether it is a replaceable sample. If ``replacement`` @@ -278,7 +278,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): def standard_normal(shape, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a standard + Returns a Tensor filled with random values sampled from a standard normal distribution with mean 0 and standard deviation 1, with ``shape`` and ``dtype``. @@ -387,7 +387,7 @@ def randn(shape, dtype=None, name=None): def normal(mean=0.0, std=1.0, shape=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a normal + Returns a Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` (standard deviation) . If ``mean`` is a Tensor, the output Tensor has the same shape and data type as ``mean``. 
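The reworked addmm validation above now also admits a 1-D input, requiring its length to be either y.shape[1] or 1 so it is broadcastable against [x.shape[0], y.shape[1]]. A short usage sketch, assuming the kernel accepts what the relaxed Python-side check now permits:

```python
import paddle

x = paddle.rand([2, 3])
y = paddle.rand([3, 4])

# 2-D input, broadcast over rows as before: out = beta * input + alpha * (x @ y)
out_2d = paddle.addmm(paddle.ones([1, 4]), x, y, beta=0.5, alpha=1.0)

# 1-D input of length y.shape[1], now allowed by the relaxed shape check.
out_1d = paddle.addmm(paddle.ones([4]), x, y)

print(out_2d.shape, out_1d.shape)  # [2, 4] [2, 4]
```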
@@ -475,7 +475,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): """ - This OP returns a Tensor filled with random values sampled from a uniform + Returns a Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. Examples: @@ -505,20 +505,16 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): it will use the seed of the global default generator (which can be set by paddle.seed). Note that if seed is not 0, this operator will always generate the same random numbers every time. Default is 0. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name(str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python + :name: code-example1 import paddle @@ -625,7 +621,7 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): def randint(low=0, high=None, shape=[1], dtype=None, name=None): """ - This OP returns a Tensor filled with random integers from a discrete uniform + Returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. If ``high`` is None (the default), the range is [0, ``low``). @@ -731,7 +727,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): def randint_like(x, low=0, high=None, dtype=None, name=None): """ - This OP returns a Tensor filled with random integers from a discrete uniform + Returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with the same shape as ``x``. (use ``dtype`` if ``dtype`` is not None) If ``high`` is None (the default), the range is [0, ``low``). @@ -957,7 +953,7 @@ def randperm(n, dtype="int64", name=None): def rand(shape, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a uniform + Returns a Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. Args: diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 52ccc60100996..372454b97a6be 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -241,6 +241,103 @@ def numel(x, name=None): return out +def nanmedian(x, axis=None, keepdim=True, name=None): + r""" + Compute the median along the specified axis, while ignoring NaNs. + + If the valid count of elements is a even number, + the average value of both elements in the middle is calculated as the median. + + Args: + x (Tensor): The input Tensor, it's data type can be int32, int64, float16, float32, float64. + axis (None|int|list|tuple, optional): + The axis along which to perform median calculations ``axis`` should be int or list of int. + ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` is less than 0, it works the same way as :math:`axis + D`. + If ``axis`` is None, median is calculated over all elements of ``x``. Default is None. 
+ keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is True. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, results of median along ``axis`` of ``x``. The output dtype is the same as `x`. + + Examples: + .. code-block:: python + :name: nanmedian-example + + import paddle + x = paddle.to_tensor([[float('nan'), 2. , 3. ], [0. , 1. , 2. ]]) + + y1 = x.nanmedian() + # y1 is [[2.]] + + y2 = x.nanmedian(0) + # y2 is [[0., 1.5, 2.5]] + + y3 = x.nanmedian(0, keepdim=False) + # y3 is [0., 1.5, 2.5] + + y4 = x.nanmedian((0, 1)) + # y4 is [[2.]] + """ + if not isinstance(x, Variable): + raise TypeError("In median, the input x should be a Tensor.") + + if isinstance(axis, (list, tuple)) and len(axis) == 0: + raise ValueError("Axis list should not be empty.") + + dims = len(x.shape) + if axis is None: + axis = [] + elif isinstance(axis, tuple): + axis = list(axis) + elif isinstance(axis, int): + axis = [axis] + + if not isinstance(axis, list): + raise ValueError( + "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." + ) + + for i in range(len(axis)): + if not isinstance(axis[i], int) or not (axis[i] < dims and + axis[i] >= -dims): + raise ValueError( + "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." + ) + if axis[i] < 0: + axis[i] += dims + + if len(axis) != len(set(axis)): + raise ValueError("Axis has duplicated elements.") + + if _in_legacy_dygraph(): + median_index, out = _C_ops.nanmedian(x, 'axis', axis, 'keepdim', + keepdim) + return out + + check_variable_and_dtype( + x, 'X', ['int32', 'int64', 'float16', 'float32', 'float64'], + 'nanmedian') + + helper = LayerHelper('nanmedian', **locals()) + attrs = {'axis': axis, 'keepdim': keepdim} + out = helper.create_variable_for_type_inference(x.dtype) + medians = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='nanmedian', + inputs={'X': x}, + outputs={'Out': out, + 'MedianIndex': medians}, + attrs=attrs) + return out + + def median(x, axis=None, keepdim=False, name=None): """ Compute the median along the specified axis. 
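The nanmedian docstring example above can be cross-checked against NumPy, which follows the same convention: NaNs are ignored and, when the number of valid elements is even, the two middle values are averaged. This is only a reference comparison, not the kernel Paddle dispatches to.

```python
import numpy as np

x = np.array([[np.nan, 2.0, 3.0],
              [0.0, 1.0, 2.0]])

print(np.nanmedian(x))                         # 2.0             matches y1 (up to keepdim)
print(np.nanmedian(x, axis=0, keepdims=True))  # [[0.  1.5 2.5]] matches y2
print(np.nanmedian(x, axis=0))                 # [0.  1.5 2.5]   matches y3 (keepdim=False)
```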
diff --git a/python/paddle/tests/test_async_read_write.py b/python/paddle/tests/test_async_read_write.py index babdf43199dd6..1432063421586 100644 --- a/python/paddle/tests/test_async_read_write.py +++ b/python/paddle/tests/test_async_read_write.py @@ -96,7 +96,9 @@ def test_main(self): with _test_eager_guard(): self.func_setUp() self.func_test_async_read_empty_offset_and_count() + self.func_setUp() self.func_test_async_read_success() + self.func_setUp() self.func_test_async_read_only_1dim() self.func_setUp() self.func_test_async_read_empty_offset_and_count() diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 54a5100c892fc..44865940adb44 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -189,6 +189,18 @@ func : assign backward : assign_grad +- api : assign_out_ + args : (Tensor x, Tensor output) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : assign + param : [x] + inplace : (output -> out) + backward : assign_out__grad + # atan - api : atan args : (Tensor x) @@ -319,6 +331,16 @@ func : ceil backward : ceil_grad +- api : celu + args : (Tensor x, float alpha) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : celu + backward : celu_grad + # cholesky - api : cholesky args : (Tensor x, bool upper) @@ -385,6 +407,12 @@ use_gpudnn : true backward : conv2d_transpose_grad +- api : conv3d + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor + invoke : conv3d_impl(input, filter, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + backward : conv3d_grad + - api : conv3d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) @@ -470,6 +498,17 @@ optional : mask backward : deformable_conv_grad +- api : depthwise_conv2d + args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + output : Tensor(out) + invoke : conv2d_impl(x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + backward : depthwise_conv2d_grad + # infer_meta : + # func : ConvTransposeInferMeta + # prams: [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search] + # kernel : + # func : depthwise_conv2d + - api : depthwise_conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) @@ -563,7 +602,7 @@ - api : einsum args : (Tensor[] x, str equation) - output : Tensor + output : Tensor, Tensor[]{x.size()} infer_meta : func : EinsumInferMeta param : [x, equation] @@ -591,6 +630,12 @@ func : elu backward : elu_grad +- api : embedding + args : (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) + output : Tensor + invoke : embedding_impl(x, weight, padding_idx, sparse) + backward : embedding_grad + - api : 
empty args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor @@ -1534,7 +1579,7 @@ func : PadInferMeta kernel : func : pad - # backward : pad_grad + backward : pad_grad - api : pad3d args : (Tensor x, IntArray paddings, str mode, float pad_value, str data_format) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 96896b65f4041..1f19dec992d2f 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -48,8 +48,7 @@ def __init__(self, api_item_yaml): 'func']) == 1 or not self.kernel['func'][1].endswith( '_sr') else True self.data_transform = self.parse_data_transform(api_item_yaml) - self.inplace_map, self.view_map = self.parse_inplace_and_view( - api_item_yaml) + self.inplace_map, self.view_map = {}, {} def get_api_name(self, api_item_yaml): return api_item_yaml['api'] @@ -141,7 +140,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'int[]': 'const std::vector&' } optional_types_trans = { - 'Tensor': 'paddle::optional', + 'Tensor': 'const paddle::optional&', 'Tensor[]': 'const paddle::optional>&', 'int': 'paddle::optional', 'int32_t': 'paddle::optional', @@ -224,16 +223,18 @@ def parse_output_item(output_item): if len(temp_list) == 1: out_type, out_name, size_expr = parse_output_item(temp_list[0]) - return [out_type], [out_name], size_expr + return [out_type], [out_name], [size_expr] else: out_type_list = [] out_name_list = [] + out_size_expr_list = [] for output_item in temp_list: out_type, out_name, size_expr = parse_output_item(output_item) out_type_list.append(out_type) out_name_list.append(out_name) + out_size_expr_list.append(size_expr) - return out_type_list, out_name_list, size_expr + return out_type_list, out_name_list, out_size_expr_list def parse_infer_meta(self, infer_meta_config): infer_meta = infer_meta_config @@ -301,31 +302,6 @@ def parse_data_transform(self, api_item_yaml): return data_transform - def parse_inplace_and_view(self, api_item_yaml): - inplace_map, view_map = {}, {} - for mode in ['inplace', 'view']: - if mode in api_item_yaml: - if mode == 'inplace': - inplace_map = {} - else: - view_map = {} - in_out_mapping_list = api_item_yaml[mode].split(',') - for item in in_out_mapping_list: - result = re.search(r"(?P\w+)\s*->\s(?P\w+)", item) - in_val = result.group('in') - out_val = result.group('out') - assert in_val in self.inputs['names'], \ - f"{self.api} : {mode} input error: the input var name('{in_val}') is not found in the input args of {self.api}." - assert out_val in self.outputs['names'], \ - f"{self.api} : {mode} output error: the output var name('{out_val}') is not found in the output args of {self.api}." 
- - if mode == 'inplace': - inplace_map[out_val] = in_val - else: - view_map[out_val] = in_val - - return inplace_map, view_map - # Override by child class def get_return_type(self, inplace_flag=False): return None @@ -510,18 +486,7 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: param_code = param_code + param + "_metas, " elif param in self.optional_vars: - meta_tensor_code = meta_tensor_code + f""" -{code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param} = paddle::none; -{code_indent} phi::DenseTensor {param}_dt; -{code_indent} phi::MetaTensor {PREFIX_TENSOR_NAME}meta_tmp_{param}({param}_dt); -{code_indent} if ({PREFIX_TENSOR_NAME}{param}_ptr) {{ -{code_indent} {PREFIX_TENSOR_NAME}meta_tmp_{param}.set_dtype( {PREFIX_TENSOR_NAME}{param}_ptr->dtype() ); -{code_indent} {PREFIX_TENSOR_NAME}meta_tmp_{param}.set_dims( {PREFIX_TENSOR_NAME}{param}_ptr->dims() ); -{code_indent} {PREFIX_TENSOR_NAME}meta_tmp_{param}.set_layout( {PREFIX_TENSOR_NAME}{param}_ptr->layout() ); -{code_indent} {PREFIX_TENSOR_NAME}meta_ref_{param} = {PREFIX_TENSOR_NAME}meta_tmp_{param}; -{code_indent} }}\n""" - - param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " + param_code = param_code + "MakeMetaTensor(" + PREFIX_TENSOR_NAME + param + "), " else: raise ValueError( f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." @@ -566,8 +531,8 @@ def get_kernel_args(self, code_indent): 'const std::vector&', 'const paddle::optional': 'paddle::optional', - 'paddle::optional': - 'paddle::optional', + 'const paddle::optional&': + 'const paddle::optional&', 'const paddle::optional>&': 'paddle::optional&>' } @@ -595,11 +560,7 @@ def get_kernel_args(self, code_indent): trans_flag = "{false, true}" if input_name in self.optional_vars: input_tensor_code = input_tensor_code + f""" -{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); -{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag}); -{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ -{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); -{code_indent} }}""" +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});""" else: if self.inputs['input_info'][input_name] == "const Tensor&": @@ -675,7 +636,7 @@ def get_selected_rows_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::SelectedRows&', 'const paddle::optional&': - 'paddle::optional' + 'const paddle::optional&' } out_trans_map = {'Tensor': 'phi::SelectedRows*'} input_names = self.inputs['names'] @@ -840,6 +801,8 @@ def gene_api_code(self): if self.is_base_api: api_code = self.gene_base_api_code() if len(self.inplace_map) > 0: + if self.api[-1] == '_': + api_code = "" api_code = api_code + self.gene_base_api_code(inplace_flag=True) return api_code diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 0de60c14d3a42..1721da19295d5 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -30,6 +30,8 @@ def __init__(self, api_item_yaml): super(ForwardAPI, self).__init__(api_item_yaml) self.is_dygraph_api, self.intermediate_outs = self.parse_intermediate( api_item_yaml) + self.inplace_map, self.view_map = self.parse_inplace_and_view( + api_item_yaml) def get_api_func_name(self): 
if self.is_dygraph_api: @@ -47,6 +49,31 @@ def parse_intermediate(self, api_item_yaml): else: return False, [] + def parse_inplace_and_view(self, api_item_yaml): + inplace_map, view_map = {}, {} + for mode in ['inplace', 'view']: + if mode in api_item_yaml: + if mode == 'inplace': + inplace_map = {} + else: + view_map = {} + in_out_mapping_list = api_item_yaml[mode].split(',') + for item in in_out_mapping_list: + result = re.search(r"(?P\w+)\s*->\s*(?P\w+)", item) + in_val = result.group('in') + out_val = result.group('out') + assert in_val in self.inputs['names'], \ + f"{self.api} : {mode} input error: the input var name('{in_val}') is not found in the input args of {self.api}." + assert out_val in self.outputs['names'], \ + f"{self.api} : {mode} output error: the output var name('{out_val}') is not found in the output args of {self.api}." + + if mode == 'inplace': + inplace_map[out_val] = in_val + else: + view_map[out_val] = in_val + + return inplace_map, view_map + def get_return_type_with_intermediate(self, inplace_flag=False): out_type_list = [] for i, out_type in enumerate(self.outputs['types']): @@ -111,10 +138,10 @@ def gene_output(self, {code_indent} {return_type} api_output{inplace_assign};""" if return_type == 'std::vector': - assert self.outputs['out_size_expr'] is not None, \ + assert self.outputs['out_size_expr'][0] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr'][0]}, kernel_backend, &api_output);""" else: output_create = output_create + f""" @@ -195,7 +222,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index a720c27543c06..d6c148e6ca925 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -32,6 +32,7 @@ param : [x] kernel : func : acos_grad + inplace : (out_grad -> x_grad) - backward_api : acosh_grad forward : acosh (Tensor x) -> Tensor(out) @@ -42,6 +43,7 @@ param : [x] kernel : func : acosh_grad + inplace : (out_grad -> x_grad) - backward_api : add_double_grad forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) @@ -54,6 +56,7 @@ func : add_double_grad optional : grad_x_grad, grad_y_grad backward : add_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : add_grad forward : add (Tensor x, Tensor y) -> Tensor(out) @@ -66,6 +69,7 @@ func : add_grad no_need_buffer : x, y backward : add_double_grad + inplace : (out_grad -> x_grad) - backward_api : add_n_grad forward : add_n (Tensor[] x) -> Tensor(out) @@ -83,6 +87,7 @@ param : [grad_grad_x, grad_grad_y] kernel : func : add_triple_grad + inplace : (grad_grad_out_grad -> grad_grad_x_grad) - backward_api : addmm_grad forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out) @@ -114,6 +119,7 @@ param : [x] kernel : func : asin_grad + inplace : (out_grad -> x_grad) 
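Note that in the quoted api_gen.py hunk (and the removed api_base.py version) the regex's named groups appear to have been eaten by formatting: `result.group('in')` / `result.group('out')` only work if the pattern names its groups, i.e. `(?P<in>\w+)\s*->\s*(?P<out>\w+)`. Below is a tiny standalone sketch of how such an `inplace : (a -> b)` spec becomes the out-name-to-in-name map that drives entries like `inplace : (out_grad -> x_grad)` in backward.yaml.

```python
import re

def parse_inplace(spec):
    """Map each output name to the input whose buffer it may reuse,
    mirroring ForwardAPI.parse_inplace_and_view (with named groups restored)."""
    inplace_map = {}
    for item in spec.split(","):
        match = re.search(r"(?P<in>\w+)\s*->\s*(?P<out>\w+)", item)
        inplace_map[match.group("out")] = match.group("in")
    return inplace_map

print(parse_inplace("(x -> out)"))            # {'out': 'x'}
print(parse_inplace("(out_grad -> x_grad)"))  # {'x_grad': 'out_grad'}
```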
- backward_api : asinh_grad forward : asinh (Tensor x) -> Tensor(out) @@ -124,6 +130,7 @@ param : [x] kernel : func : asinh_grad + inplace : (out_grad -> x_grad) - backward_api : assign_grad forward : assign (Tensor x) -> Tensor(out) @@ -131,9 +138,19 @@ output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta - param : [out_grad] kernel : func : assign + inplace : (out_grad -> x_grad) + +- backward_api : assign_out__grad + forward : assign_out_ (Tensor x, Tensor output) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + kernel : + func : assign + inplace : (out_grad -> x_grad) - backward_api : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) @@ -154,6 +171,7 @@ param : [x] kernel : func : atan_grad + inplace : (out_grad -> x_grad) - backward_api : atanh_grad forward : atanh (Tensor x) -> Tensor(out) @@ -164,6 +182,7 @@ param : [x] kernel : func : atanh_grad + inplace : (out_grad -> x_grad) - backward_api : batch_norm_double_grad forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) @@ -176,6 +195,7 @@ func : batch_norm_grad_grad data_type : x optional : out_mean, out_variance + inplace : (grad_out -> grad_out_grad) - backward_api : batch_norm_grad forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) @@ -199,6 +219,7 @@ param : [input] kernel : func : bce_loss_grad + inplace : (out_grad -> input_grad) - backward_api : brelu_grad forward : brelu (Tensor x, float t_min, float t_max) -> Tensor(out) @@ -209,6 +230,7 @@ param : [x] kernel : func : brelu_grad + inplace : (out_grad -> x_grad) - backward_api : cast_grad forward : cast (Tensor x, DataType out_dtype) -> Tensor(out) @@ -220,6 +242,7 @@ kernel : func : cast_grad data_type : out_grad + no_need_buffer : x - backward_api : ceil_grad forward : ceil(Tensor x) -> Tensor(out) @@ -230,6 +253,30 @@ param: [out_grad] kernel : func : ceil_grad + inplace : (out_grad -> x_grad) + +- backward_api : celu_double_grad + forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : celu_double_grad + inplace : (grad_x_grad -> grad_out_grad) + +- backward_api : celu_grad + forward : celu(Tensor x, float alpha) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float alpha) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : celu_grad + backward : celu_double_grad + inplace : (out_grad -> x_grad) - backward_api : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) @@ -271,6 +318,7 @@ kernel : func : clip_grad backward : clip_double_grad + inplace : (out_grad -> x_grad) - backward_api : concat_double_grad forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) 
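For the newly wired celu / celu_grad entries, a NumPy reference of the CELU definition and its derivative may help when reading the yaml; this is an illustrative sketch of the math, not the phi kernel.

```python
import numpy as np

def celu_reference(x, alpha=1.0):
    """CELU(x) = max(0, x) + min(0, alpha * (exp(x / alpha) - 1))."""
    return np.maximum(0.0, x) + np.minimum(0.0, alpha * (np.exp(x / alpha) - 1.0))

def celu_grad_reference(x, grad_out, alpha=1.0):
    # dCELU/dx is 1 for x >= 0 and exp(x / alpha) otherwise.
    return grad_out * np.where(x >= 0.0, 1.0, np.exp(x / alpha))

x = np.linspace(-3.0, 3.0, 7)
print(celu_reference(x, alpha=1.5))
print(celu_grad_reference(x, np.ones_like(x), alpha=1.5))
```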
@@ -344,6 +392,25 @@ use_gpudnn : true backward : conv2d_transpose_double_grad +- backward_api : conv3d_grad + forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad) + invoke : conv3d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) + backward : conv3d_grad_grad + +- backward_api : conv3d_grad_grad + forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv3d_grad_grad + use_gpudnn : true + optional : grad_input_grad, grad_filter_grad + - backward_api : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -363,6 +430,7 @@ param : [x] kernel : func : cos_grad + inplace : (out_grad -> x_grad) - backward_api : cosh_grad forward : cosh (Tensor x) -> Tensor(out) @@ -373,6 +441,7 @@ param : [x] kernel : func : cosh_grad + inplace : (out_grad -> x_grad) - backward_api : cross_entropy_with_softmax_grad forward : cross_entropy_with_softmax (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) -> Tensor(softmax), Tensor(loss) @@ -383,6 +452,7 @@ kernel : func : cross_entropy_with_softmax_grad data_type : softmax + inplace : (softmax -> input_grad) - backward_api : cross_grad forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out) @@ -424,6 +494,25 @@ data_type : x optional : mask +- backward_api : depthwise_conv2d_grad + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + output : Tensor(input_grad), Tensor(filter_grad) + invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, 
groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) + backward : depthwise_conv2d_grad_grad + +- backward_api : depthwise_conv2d_grad_grad + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv2d_grad_grad + use_gpudnn : true + optional : grad_input_grad, grad_filter_grad + - backward_api : depthwise_conv2d_transpose_grad forward : depthwise_conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -485,6 +574,7 @@ func : divide_double_grad data_type : out optional : grad_x_grad, grad_y_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : divide_grad forward : divide (Tensor x, Tensor y) -> Tensor(out) @@ -521,8 +611,8 @@ skip_transform : out_w, out_w_grad - backward_api : einsum_grad - forward : einsum (Tensor[] x, str equation) -> Tensor(out) - args : (Tensor[] x, Tensor out_grad, str equation) + forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache) + args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation) output : Tensor[](x_grad){x.size()} infer_meta : func : UnchangedMultiInferMeta @@ -549,6 +639,7 @@ param : [x, x] kernel : func : elu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : elu_grad forward : elu (Tensor x, float alpha) -> Tensor(out) @@ -560,6 +651,13 @@ kernel : func : elu_grad backward : elu_double_grad + inplace : (out_grad -> x_grad) + +- backward_api : embedding_grad + forward : embedding (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) -> Tensor(out) + args : (Tensor x, Tensor weight, Tensor out_grad, int64_t padding_idx=-1, bool sparse=false) + output : Tensor(weight_grad) + invoke : embedding_grad_impl(x, weight, out_grad, padding_idx, sparse, weight_grad) - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) @@ -591,6 +689,7 @@ param : [out] kernel : func : exp_grad + inplace : (out_grad -> x_grad) - backward_api : expand_as_grad forward : expand_as (Tensor x, Tensor y, int[] target_shape) -> Tensor(out) @@ -633,6 +732,7 @@ param : [out] kernel : func : expm1_grad + inplace : (out_grad -> x_grad) - backward_api : flatten_grad forward : flatten(Tensor x, int start_axis, int stop_axis) -> Tensor(out), Tensor(xshape) @@ -646,6 +746,7 @@ data_type: out_grad backend: out_grad layout: out_grad + inplace : (out_grad -> x_grad) - backward_api : flip_grad forward : flip (Tensor x, int[] axis) -> Tensor(out) @@ -666,6 +767,7 @@ param: [out_grad] kernel : func : floor_grad + inplace : (out_grad 
-> x_grad) - backward_api : fmax_grad forward : fmax(Tensor x, Tensor y, int axis) -> Tensor(out) @@ -761,6 +863,7 @@ param : [x] kernel : func : hard_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : hard_sigmoid_grad forward : hard_sigmoid (Tensor x, float slope, float offset) -> Tensor(out) @@ -771,6 +874,7 @@ param : [out] kernel : func : hard_sigmoid_grad + inplace : (out_grad -> x_grad) - backward_api : hard_swish_grad forward : hard_swish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out) @@ -781,6 +885,7 @@ param : [x] kernel : func : hard_swish_grad + inplace : (out_grad -> x_grad) - backward_api : huber_loss_grad forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) @@ -886,6 +991,7 @@ param : [grad_x_grad] kernel : func : leaky_relu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : leaky_relu_grad forward : leaky_relu (Tensor x, float alpha) -> Tensor(out) @@ -897,6 +1003,7 @@ kernel : func : leaky_relu_grad backward : leaky_relu_double_grad + inplace : (out_grad -> x_grad) - backward_api : lerp_grad forward : lerp (Tensor x, Tensor y, Tensor weight) -> Tensor(out) @@ -927,6 +1034,7 @@ param : [x] kernel : func : log10_grad + inplace : (out_grad -> x_grad) - backward_api : log1p_grad forward : log1p (Tensor x) -> Tensor(out) @@ -937,6 +1045,7 @@ param : [x] kernel : func : log1p_grad + inplace : (out_grad -> x_grad) - backward_api : log2_grad forward : log2 (Tensor x) -> Tensor(out) @@ -947,6 +1056,7 @@ param : [x] kernel : func : log2_grad + inplace : (out_grad -> x_grad) - backward_api : log_double_grad forward : log_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) @@ -957,6 +1067,7 @@ param : [x, x] kernel : func : log_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : log_grad forward : log (Tensor x) -> Tensor(out) @@ -968,6 +1079,7 @@ kernel : func : log_grad backward : log_double_grad + inplace : (out_grad -> x_grad) - backward_api : log_loss_grad forward : log_loss (Tensor input, Tensor label, float epsilon) -> Tensor(out) @@ -1008,6 +1120,7 @@ param : [x] kernel : func : logsigmoid_grad + inplace : (out_grad -> x_grad) - backward_api : logsumexp_grad forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) @@ -1189,6 +1302,7 @@ param : [x] kernel : func : mish_grad + inplace : (out_grad -> x_grad) - backward_api : mode_grad forward : mode(Tensor x, int axis, bool keepdim) -> Tensor(out), Tensor(indices) @@ -1242,6 +1356,7 @@ func : multiply_double_grad optional : grad_x_grad, grad_y_grad backward : multiply_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) @@ -1306,6 +1421,15 @@ kernel : func : p_norm_grad +- backward_api : pad3d_double_grad + forward : pad3d_grad(Tensor x, Tensor grad_out, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray paddings, str mode, float pad_value, str data_format) + output : Tensor(grad_out_grad) + infer_meta : + func : Pad3dInferMeta + kernel : + func : pad3d + - backward_api : pad3d_grad forward : pad3d(Tensor x, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray paddings, str mode, float pad_value, str data_format) @@ -1316,6 +1440,29 @@ kernel : func : pad3d_grad no_need_buffer : x + backward : pad3d_double_grad + +- backward_api : 
pad_double_grad + forward : pad_grad(Tensor x, Tensor grad_out, int[] paddings, float pad_value) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] paddings, float pad_value) + output : Tensor(grad_out_grad) + infer_meta : + func : PadInferMeta + kernel : + func : pad + +- backward_api : pad_grad + forward : pad(Tensor x, int[] paddings, float pad_value) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int[] paddings, float pad_value) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : pad_grad + param: [out_grad, paddings, pad_value] + no_need_buffer : x + backward : pad_double_grad - backward_api : pixel_shuffle_grad forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) @@ -1386,6 +1533,7 @@ param: [x] kernel : func : pow_grad + inplace : (out_grad -> x_grad) - backward_api : prelu_grad forward : prelu(Tensor x, Tensor alpha, str data_format, str mode) -> Tensor(out) @@ -1435,6 +1583,7 @@ param : [out] kernel : func : reciprocal_grad + inplace : (out_grad -> x_grad) - backward_api : reduce_prod_grad forward : reduce_prod (Tensor x, int64_t[] dims, bool keep_dim, bool reduce_all) -> Tensor(out) @@ -1455,6 +1604,7 @@ param : [out] kernel : func : relu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : relu_grad forward : relu (Tensor x) -> Tensor(out) @@ -1466,6 +1616,7 @@ kernel : func : relu_grad backward: relu_double_grad + inplace : (out_grad -> x_grad) - backward_api : reshape_double_grad forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) @@ -1477,6 +1628,7 @@ kernel : func : reshape_double_grad no_need_buffer : grad_out + inplace : (grad_x_grad -> grad_out_grad) - backward_api : reshape_grad forward : reshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) @@ -1492,6 +1644,7 @@ backend: out_grad layout: out_grad backward : reshape_double_grad + inplace : (out_grad -> x_grad) - backward_api : roi_align_grad forward : roi_align (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) -> Tensor(out) @@ -1539,6 +1692,18 @@ param: [out_grad] kernel : func : round_grad + inplace : (out_grad -> x_grad) + +- backward_api : rsqrt_double_grad + forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : rsqrt_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : rsqrt_grad forward : rsqrt (Tensor x) -> Tensor(out) @@ -1549,6 +1714,8 @@ param : [out] kernel : func : rsqrt_grad + backward : rsqrt_double_grad + inplace : (out_grad -> x_grad) - backward_api : scale_double_grad forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x) @@ -1563,6 +1730,7 @@ output : Tensor(x_grad) invoke : scale(out_grad, scale, 0.0, bias_after_scale) backward : scale_double_grad + inplace : (out_grad -> x_grad) - backward_api : scale_triple_grad forward : scale_double_grad (Tensor grad_grad_x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_grad_out) @@ -1623,6 +1791,7 @@ param : [x] kernel : func : sigmoid_cross_entropy_with_logits_grad + inplace : (out_grad -> x_grad) - backward_api : sigmoid_double_grad forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x) @@ -1634,6 +1803,7 @@ kernel : 
func : sigmoid_double_grad backward : sigmoid_triple_grad + inplace : (grad_x_grad -> fwd_grad_out_grad) - backward_api : sigmoid_grad forward : sigmoid (Tensor x) -> Tensor(out) @@ -1645,6 +1815,7 @@ kernel : func : sigmoid_grad backward : sigmoid_double_grad + inplace : (out_grad -> x_grad) - backward_api : sigmoid_triple_grad forward : sigmoid_double_grad (Tensor out, Tensor fwd_grad_out, Tensor grad_grad_x) -> Tensor(grad_out), Tensor(grad_grad_out) @@ -1656,6 +1827,7 @@ kernel : func : sigmoid_triple_grad optional : grad_grad_out_grad + inplace : (grad_grad_x -> fwd_grad_out_grad) - backward_api : silu_grad forward : silu (Tensor x) -> Tensor(out) @@ -1666,6 +1838,7 @@ param : [x] kernel : func : silu_grad + inplace : (out_grad -> x_grad) - backward_api : sin_grad forward : sin (Tensor x) -> Tensor(out) @@ -1676,6 +1849,7 @@ param : [x] kernel : func : sin_grad + inplace : (out_grad -> x_grad) - backward_api : sinh_grad forward : sinh (Tensor x) -> Tensor(out) @@ -1686,6 +1860,7 @@ param : [x] kernel : func : sinh_grad + inplace : (out_grad -> x_grad) - backward_api : slice_grad forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) @@ -1707,6 +1882,7 @@ param : [x] kernel : func : soft_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : softmax_grad forward : softmax (Tensor x, int axis) -> Tensor(out) @@ -1726,6 +1902,17 @@ invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. +- backward_api : sqrt_double_grad + forward : sqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : sqrt_double_grad + inplace : (grad_x_grad -> grad_out_grad) + - backward_api : sqrt_grad forward : sqrt (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1735,6 +1922,19 @@ param : [out] kernel : func : sqrt_grad + backward : sqrt_double_grad + inplace : (out_grad -> x_grad) + +- backward_api : square_double_grad + forward : square_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : square_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : square_grad forward : square (Tensor x) -> Tensor(out) @@ -1745,6 +1945,14 @@ param : [x] kernel : func : square_grad + backward : square_double_grad + inplace : (out_grad -> x_grad) + +- backward_api : squeeze_double_grad + forward : squeeze_grad(Tensor xshape, Tensor grad_out, int[] axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] axes) + output : Tensor(grad_out_grad) + invoke: squeeze(grad_x_grad, axes) - backward_api : squeeze_grad forward : squeeze(Tensor x, int[] axes) -> Tensor(out), Tensor(xshape) @@ -1755,6 +1963,8 @@ param: [xshape] kernel : func : squeeze_grad + inplace : (out_grad -> x_grad) + backward: squeeze_double_grad - backward_api : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) @@ -1790,6 +2000,7 @@ func : subtract_double_grad optional : grad_x_grad, grad_y_grad no_need_buffer : y, grad_out + inplace : (grad_x_grad -> grad_out_grad) - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) @@ -1802,6 
+2013,7 @@ func : subtract_grad no_need_buffer : x, y backward : subtract_double_grad + inplace : (out_grad -> x_grad) - backward_api : sum_double_grad forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) @@ -1837,6 +2049,7 @@ param : [x] kernel : func : swish_grad + inplace : (out_grad -> x_grad) - backward_api : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) @@ -1857,6 +2070,7 @@ param : [x] kernel : func : tan_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_double_grad forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) @@ -1868,6 +2082,7 @@ kernel : func : tanh_double_grad backward : tanh_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : tanh_grad forward : tanh (Tensor x) -> Tensor(out) @@ -1879,6 +2094,7 @@ kernel : func : tanh_grad backward : tanh_double_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) @@ -1889,6 +2105,7 @@ param : [x] kernel : func : tanh_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_triple_grad forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) @@ -1899,6 +2116,7 @@ param : [out, out, grad_x_grad_forward] kernel : func : tanh_triple_grad + inplace : (grad_x_grad_forward -> grad_out_forward_grad) - backward_api : thresholded_relu_grad forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out) @@ -1909,6 +2127,7 @@ param : [x] kernel : func : thresholded_relu_grad + inplace : (out_grad -> x_grad) - backward_api : tile_double_grad forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) @@ -2016,15 +2235,24 @@ func : unfold_grad no_need_buffer : x +- backward_api : unsqueeze_double_grad + forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axes) + output : Tensor(grad_out_grad) + invoke : unsqueeze(grad_x_grad, axes) + - backward_api : unsqueeze_grad forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) - args : (Tensor xshape, Tensor out_grad) + args : (Tensor xshape, Tensor out_grad, IntArray axes) output : Tensor(x_grad) infer_meta : func : KernelWithXShapeInferMeta param: [xshape] kernel : func : unsqueeze_grad + param: [xshape, out_grad] + inplace : (out_grad -> x_grad) + backward : unsqueeze_double_grad - backward_api : where_grad forward : where (Tensor condition, Tensor x, Tensor y) -> Tensor(out) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 502c221952fb4..886748eeb290e 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -209,7 +209,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/infermeta/backward.h" diff --git a/python/paddle/utils/code_gen/intermediate_api_gen.py b/python/paddle/utils/code_gen/intermediate_api_gen.py index 2df3ac643614e..4e4875b596192 100644 --- a/python/paddle/utils/code_gen/intermediate_api_gen.py +++ 
b/python/paddle/utils/code_gen/intermediate_api_gen.py @@ -44,7 +44,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/sparse_api_custom_impl.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/python/paddle/utils/code_gen/type_mapping.py b/python/paddle/utils/code_gen/type_mapping.py index ecbd1f494c2ee..c6e110907a9f7 100644 --- a/python/paddle/utils/code_gen/type_mapping.py +++ b/python/paddle/utils/code_gen/type_mapping.py @@ -108,7 +108,7 @@ sr_input_types_map = {'Tensor': 'const phi::SelectedRows&', } sr_optional_input_types_map = { - 'Tensor': 'paddle::optional', + 'Tensor': 'const paddle::optional&', } sr_output_types_map = {'Tensor': 'phi::SelectedRows*', } diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py index c14d39e9842be..bf798f9734d53 100644 --- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py +++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py @@ -43,10 +43,7 @@ def gene_wrapped_infermeta_and_register(api): 'const std::vector&': 'const std::vector&', 'Tensor': 'MetaTensor*', 'std::vector': 'std::vector*', - 'const paddle::optional': - 'const paddle::optional', - 'paddle::optional': - 'paddle::optional' + 'const paddle::optional&': 'const MetaTensor&' } wrapped_infermeta_name = get_wrapped_infermeta_name(api.api) diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 90fba1c4130e5..7927e9faee370 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -380,6 +380,7 @@ def adjust_brightness(img, brightness_factor): Examples: .. code-block:: python + :name: code-example1 import numpy as np from PIL import Image @@ -388,9 +389,13 @@ def adjust_brightness(img, brightness_factor): fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') fake_img = Image.fromarray(fake_img) + print(fake_img.size) # (300, 256) + print(fake_img.load()[1,1]) # (95, 127, 202) + converted_img = F.adjust_brightness(fake_img, 0.5) + print(converted_img.size) # (300, 256) + print(converted_img.load()[1,1]) # (47, 63, 101) + - converted_img = F.adjust_brightness(fake_img, 0.4) - print(converted_img.size) """ if not (_is_pil_image(img) or _is_numpy_image(img) or _is_tensor_image(img)): diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index fea2efb1fb2b1..31f56e890558c 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -1042,14 +1042,32 @@ class RandomCrop(BaseTransform): size (sequence|int): Desired output size of the crop. If size is an int instead of sequence like (h, w), a square crop (size, size) is made. - padding (int|sequence|optional): Optional padding on each border + padding (int|sequence, optional): Optional padding on each border of the image. If a sequence of length 4 is provided, it is used to pad left, - top, right, bottom borders respectively. Default: 0. - pad_if_needed (boolean|optional): It will pad the image if smaller than the + top, right, bottom borders respectively. Default: None, without padding. 
+ pad_if_needed (boolean, optional): It will pad the image if smaller than the desired size to avoid raising an exception. Default: False. + fill (float|tuple, optional): Pixel fill value for constant fill. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. Default: 0. + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value on the edge of the image + + - reflect: pads with reflection of image (without repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image (repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - Shape: + Shape - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A random cropped image. @@ -1059,17 +1077,17 @@ class RandomCrop(BaseTransform): Examples: .. code-block:: python + :name: code-example1 - import numpy as np - from PIL import Image + import paddle from paddle.vision.transforms import RandomCrop - transform = RandomCrop(224) - fake_img = Image.fromarray((np.random.rand(324, 300, 3) * 255.).astype(np.uint8)) + fake_img = paddle.randint(0, 255, shape=(3, 324,300), dtype = 'int32') + print(fake_img.shape) # [3, 324, 300] - fake_img = transform(fake_img) - print(fake_img.size) + crop_img = transform(fake_img) + print(crop_img.shape) # [3, 224, 224] """ def __init__(self, diff --git a/python/requirements.txt b/python/requirements.txt index e7fc6cd651cb0..74f2c2b9401aa 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,9 +1,9 @@ requests>=2.20.0 numpy>=1.13 -protobuf>=3.1.0 +protobuf>=3.1.0, <=3.20.0 Pillow six decorator astor -paddle_bfloat==0.1.2 +paddle_bfloat==0.1.7 opt_einsum==3.3.0 diff --git a/python/setup.py.in b/python/setup.py.in index c1a6e3d3947a9..2a0d745729aab 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -281,6 +281,7 @@ packages=['paddle', 'paddle.incubate.tensor', 'paddle.incubate.multiprocessing', 'paddle.incubate.nn', + 'paddle.incubate.asp', 'paddle.incubate.passes', 'paddle.distribution', 'paddle.distributed.sharding', diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 7a48ff0148e76..ea82c46b95c5e 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -8,7 +8,7 @@ pygame==2.1.0 hypothesis opencv-python<=4.2.0.32 visualdl -paddle2onnx>=0.8.2 +paddle2onnx>=0.9.6 scipy>=1.6; python_version >= "3.7" scipy>=1.5; python_version == "3.6" prettytable diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index ee2d984035624..8f1948de8a4dc 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -6,7 +6,7 @@ # run a container # docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash -FROM graphcore/poplar-extbaidu:2.5.0-ubuntu-18.04-20220407 +FROM graphcore/poplar:2.5.1 MAINTAINER PaddlePaddle Authors # ENV variables diff --git a/tools/dockerfile/ci_dockerfile.sh 
b/tools/dockerfile/ci_dockerfile.sh index ed13ca8762500..485bfd7968f05 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -20,7 +20,7 @@ function make_ubuntu_dockerfile(){ sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ - tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext zstd \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ tar -xvf git-2.17.1.tar.gz \&\& \ cd git-2.17.1 \&\& \ @@ -38,7 +38,7 @@ function make_ubuntu_dockerfile(){ ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ RUN dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ - RUN apt update \&\& apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} + RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} } diff --git a/tools/nvcc_lazy b/tools/nvcc_lazy new file mode 100755 index 0000000000000..9cb49b04ffaff --- /dev/null +++ b/tools/nvcc_lazy @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
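+# How this wrapper works: for nvcc >= 11.6 it simply invokes the real nvcc (see the
+# version check below). For older toolkits it runs nvcc with --dryrun, splits the
+# generated build steps around the cicc step that emits the *.cudafe1.stub.c file,
+# and patches that stub so the __cudaRegisterAll() call is wrapped in a guard:
+# when CUDA_MODULE_LOADING=LAZY is set, module registration is skipped at
+# static-initialization time and performed on the first kernel launch instead;
+# for any other value the original eager behavior is preserved.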
+ + +## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY + +# check nvcc version, if nvcc >= 11.6, just run nvcc itself +CUDA_VERSION=$(nvcc --version | grep -oP '(?<=cuda_)\d*\.\d*') +CUDA_VERSION_MAJOR=${CUDA_VERSION%.*} +CUDA_VERSION_MINOR=${CUDA_VERSION#*.} +if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 6) )); then + nvcc "$@" + exit +fi + +BUILDDIR=$(mktemp -d /tmp/nvcc-lazy-build.XXXXXXXX) +echo "$@" > ${BUILDDIR}/args +BUILDSH=${BUILDDIR}/build.sh +/usr/local/cuda/bin/nvcc --dryrun --keep --keep-dir=${BUILDDIR} "$@" 2>&1 | sed -e 's/#\$ //;/^rm/d' > $BUILDSH +sed -i -e '/^\s*--/d' $BUILDSH +sed -ne '1,/^cicc.*cudafe1.stub.c/p' ${BUILDSH} > ${BUILDSH}.pre +sed -e '1,/^cicc.*cudafe1.stub.c/d' ${BUILDSH} > ${BUILDSH}.post + +sed -i -e '/LIBRARIES=/{s/\s//g;s/""/ /g}' ${BUILDSH}.pre + +/usr/bin/env bash ${BUILDSH}.pre +STUBF=$(find $BUILDDIR -name *.cudafe1.stub.c) +CUFILE=$(basename -s '.cudafe1.stub.c' $STUBF) +sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' $STUBF +sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' $STUBF +# sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\"===> ${CUFILE} lazy-load? %d\\\\n\", l); __do____cudaRegisterAll();}" $STUBF +sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}" $STUBF +sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' $STUBF +sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' $STUBF +/usr/bin/env bash ${BUILDSH}.post +rm -rf $BUILDDIR diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 5088ad3457fb9..7c43ef1a6d2e3 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -824,7 +824,7 @@ 'test_mean_op', 'test_is_tensor', 'test_run_program_op', 'test_cuda_random_seed', 'test_linear_interp_op', 'test_fuse_all_reduce_pass', 'tensor_util_test', 'test_median', - 'test_linear', 'test_imperative_qat_amp', + 'test_nanmedian', 'test_linear', 'test_imperative_qat_amp', 'test_truncated_gaussian_random_op', 'test_lstm_cudnn_op', 'copy_same_tensor_test', 'test_squeeze2_op', 'naive_best_fit_allocator_test', 'test_model', 'test_py_reader_combination', @@ -2047,6 +2047,8 @@ 'test_lambda', 'test_prod_op', 'test_fused_attention_op_api', + 'test_fused_bias_dropout_residual_layer_norm_op', + 'test_fused_bias_dropout_residual_layer_norm_op_api', 'test_complex_grad_accumulated', 'test_deg2rad', 'test_lgamma_op', diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 2d8692c5bc7e5..1bd9f029d552c 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -339,7 +339,9 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): Please use '.. code-block:: python' to format the sample code.""") return [] else: - logger.warning("Error: No sample code!") + logger.error( + "Error: No sample code found! 
Please check if the API comment contains string 'Examples:' correctly" ) return [] sample_code_filenames = [] diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 6067b40f0a7c1..95c5ecf713112 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -399,6 +399,7 @@ 'test_positive_negative_pair_op', 'test_precision_recall_op', 'test_prelu_op', + 'test_rrelu_op', 'test_prelu_mkldnn_op', 'test_print_op', 'test_prior_box_op',
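Note: the sampcd_processor.py change above upgrades the missing-sample warning to an error that points at the 'Examples:' marker. As a minimal sketch of the condition being reported (an illustration under assumptions, not the tool's actual parsing logic):

def has_sample_code(docstring):
    # Sketch only: a docstring counts as having sample code when an 'Examples:'
    # section is present and its body uses the '.. code-block:: python' directive
    # referenced by the surrounding messages.
    if 'Examples:' not in docstring:
        return False  # this is the case the new logger.error call reports
    return '.. code-block:: python' in docstring.split('Examples:', 1)[1]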