diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 39d9ae5e0dcd7..06b9dfeb725da 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/Lucas-C/pre-commit-hooks.git - sha: v1.0.1 + rev: v1.1.14 hooks: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 43c2208182a55..92a526a2b58a7 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -52,6 +52,7 @@ if(NOT DEFINED CBLAS_PROVIDER) set(OPENBLAS_INCLUDE_SEARCH_PATHS ${OPENBLAS_ROOT}/include /usr/include + /usr/include/lapacke /usr/include/openblas /usr/local/opt/openblas/include) set(OPENBLAS_LIB_SEARCH_PATHS @@ -75,7 +76,7 @@ if(NOT DEFINED CBLAS_PROVIDER) string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) - if (${ver} VERSION_GREATER_EQUAL "0.3.7") + if (${ver} VERSION_GREATER_EQUAL "0.3.5") set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 43d5002fe3819..d9b302e9ed34d 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220520") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220601") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220520") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220601") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 3edd13ccd597f..817a0de6e0ca9 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -231,6 +231,15 @@ class GradNodeGenerationInfo { return &no_need_buffer_ins_; } + const std::unordered_map& GetBackwardInplaceMap() + const { + return backward_inplace_map_; + } + std::unordered_map* + GetMutableBackwardInplaceMap() { + return &backward_inplace_map_; + } + private: std::string op_base_type_; std::map grad_outs_slotname_map_; @@ -244,6 +253,7 @@ class GradNodeGenerationInfo { grad_outs_; paddle::framework::AttributeMap grad_attrs_; std::unordered_set no_need_buffer_ins_; + std::unordered_map backward_inplace_map_; }; public: @@ -979,6 +989,12 @@ static bool CollectGradInformationFromOpInfo( *(*op_base_infos)[index].GetMutableNoNeedBufferInputs() = inferer(g_ins, g_outs, *op_base_grad_attrs); } + + auto& infer_backward_inplace = op_base.Info().infer_inplace_; + if (infer_backward_inplace) { + *(*op_base_infos)[index].GetMutableBackwardInplaceMap() = + infer_backward_inplace(true); + } } /* ------ Slot Name Matching ---- */ @@ -1005,7 +1021,7 @@ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, const GradNodeGenerationInfo& bwd_info, const std::string& trace_op_body_str, - std::map inplace_map = {}) { + std::map forward_inplace_map = {}) { 
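The new backward_inplace_map_ stored on GradNodeGenerationInfo mirrors the forward inplace map but is keyed by grad-op input names; it is filled from op_base.Info().infer_inplace_(true) above. A hypothetical example of what such a map can hold for an activation-style grad op (the names are illustrative, not taken from this patch):

    #include <string>
    #include <unordered_map>

    // Hypothetical backward inplace pair: the grad kernel may write its output
    // "X@GRAD" into the buffer of its grad input "Out@GRAD", provided that
    // buffer is not shared with anyone else (checked later via use_count()).
    const std::unordered_map<std::string, std::string> backward_inplace_map = {
        {"Out@GRAD", "X@GRAD"},
    };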
VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -1045,8 +1061,10 @@ static std::string GenerateGradNodeCreationContent( } else { // In inplace op, the case where output is duplicable is not considered. // Replace output directly with input in inplace op. - if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = LegalizeVarName(inplace_map[output_name]); + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { + auto inplace_input_name = + LegalizeVarName(forward_inplace_map[output_name]); const std::string& inplace_input_autograd_name = "p_autograd_" + inplace_input_name; const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = @@ -1103,12 +1121,12 @@ static std::string GenerateGradNodeCreationContent( // check inplace input to avoid inplace operations on leaf nodes with // stop_gradient=False. std::string check_inplace_str = ""; - if (!inplace_map.empty()) { + if (!forward_inplace_map.empty()) { const char* CHECKING_INPLACE_TEMPLATE = " // Check Inplace\n" " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " "require_any_grad);\n"; - for (auto& inplace_pair : inplace_map) { + for (auto& inplace_pair : forward_inplace_map) { std::string inplace_name = LegalizeVarName(inplace_pair.second); check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name); @@ -1161,8 +1179,9 @@ static std::string GenerateGradNodeCreationContent( const char* SET_TENSOR_WRAPPER_TEMPLATE = " grad_node->SetTensorWrapper%s(%s);\n"; // Replace output directly with input in inplace op. - if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { - auto inplace_input_name = inplace_map[tensor_wrapper_name]; + if (!forward_inplace_map.empty() && + forward_inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = forward_inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), LegalizeVarName(inplace_input_name)); @@ -1213,8 +1232,9 @@ static std::string GenerateGradNodeCreationContent( for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); // Replace output directly with input in inplace op. 
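For a concrete sense of the generated guard, CHECKING_INPLACE_TEMPLATE above expands, per inplace pair, to roughly the following (assuming a hypothetical op whose inplace input is named X):

    // Check Inplace
    egr::EagerUtils::CheckInplace(X, p_autograd_X, require_any_grad);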
- if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = inplace_map[output_name]; + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { + auto inplace_input_name = forward_inplace_map[output_name]; const std::string& inplace_input_autograd_name = "p_autograd_" + LegalizeVarName(inplace_input_name); size_t output_position = fwd_outputs_name_pos_map.at(output_name); @@ -1345,7 +1365,7 @@ static std::string GenerateGradNodeCreationContent( static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, const GradNodeGenerationInfo& bwd_info, - std::map inplace_map = {}) { + std::map forward_inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1434,8 +1454,8 @@ static std::pair GenerateForwardFunctionContents( // inplace tensor can't be const const char* FWD_INS_ARG_TEMPLATE; bool flag_find_input_name = false; - if (!inplace_map.empty()) { - for (auto& inplace_pair : inplace_map) { + if (!forward_inplace_map.empty()) { + for (auto& inplace_pair : forward_inplace_map) { if (inplace_pair.second == input_name) { flag_find_input_name = true; FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; @@ -1605,15 +1625,16 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_name); - } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + } else if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // In inplace op, replace the output with the input directly. PADDLE_ENFORCE_NE( - inplace_map[output_name], "", + forward_inplace_map[output_name], "", paddle::platform::errors::InvalidArgument( "Inplace op %s has no input corresponding to output %s.", op_type, output_name)); const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; - auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = forward_inplace_map[output_name]; outs_contents_str += paddle::string::Sprintf( FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); @@ -1651,7 +1672,7 @@ static std::pair GenerateForwardFunctionContents( if (inplace_mapping_str.size() > 0) inplace_mapping_str.pop_back(); // Remove trailing "," - if ((op_type != "cast") && (inplace_map.empty())) { + if ((op_type != "cast") && (forward_inplace_map.empty())) { VLOG(6) << "Generating Dygraph Forward AMP"; const char* AMP_LOGIC_CONTEXT = " if (egr::Controller::Instance().GetAMPLevel() != " @@ -1743,7 +1764,7 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Generated Outs Map"; // [Generation] Apply View Strategy (Tensor) - if (inplace_map.empty() && view_op_map.count(op_type)) { + if (forward_inplace_map.empty() && view_op_map.count(op_type)) { const char* HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT = " if (ins.count(\"%s\") && outs.count(\"%s\")) {\n" " egr::EagerUtils::HandleViewBetweenInputAndOutput(ins[\"%s\"][0], " @@ -1852,10 +1873,11 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - if (!inplace_map.empty() && inplace_map.count(output_name)) { + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // Modify meta info of inplace tensor. // Bump inplace version of inplace tensor. 
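One practical consequence of the FWD_INS_ARG_TEMPLATE branch above: an input that appears in forward_inplace_map is taken by non-const reference so the generated forward can write the result back into it, and that same variable is what gets returned for the matching output. Illustrative declarations only, with hypothetical op and argument names (the exact inplace function-name scheme is defined elsewhere in the generator):

    // Regular forward: inputs stay read-only.
    paddle::experimental::Tensor relu_dygraph_function(
        const paddle::experimental::Tensor& X);
    // Inplace variant: the inplace input is a mutable reference.
    paddle::experimental::Tensor relu__dygraph_function(
        paddle::experimental::Tensor& X);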
- auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = forward_inplace_map[output_name]; const char* FWD_OUT_TENSOR_TEMPLATE = " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n" " %s.bump_inplace_version();\n" @@ -1878,10 +1900,11 @@ static std::pair GenerateForwardFunctionContents( return_types[return_position] = "paddle::experimental::Tensor"; } - if (!inplace_map.empty() && inplace_map.count(output_name)) { + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // Replace output directly with input in inplace op. return_contents[return_position] = - LegalizeVarName(inplace_map[output_name]); + LegalizeVarName(forward_inplace_map[output_name]); } else { return_contents[return_position] = output_varname; } @@ -1903,7 +1926,7 @@ static std::pair GenerateForwardFunctionContents( // If GradNode needs to be generated, pass `trace_op_body_str` // into `GenerateGradNodeCreationContent`. std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - fwd_info, bwd_info, trace_op_body_str, inplace_map); + fwd_info, bwd_info, trace_op_body_str, forward_inplace_map); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; @@ -1960,7 +1983,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Full Function std::string function_name; - if (inplace_map.empty()) { + if (forward_inplace_map.empty()) { function_name = op_type + "_dygraph_function"; } else { // change function_name for inplace op. @@ -2013,12 +2036,39 @@ static std::string GenerateSingleOpBase( std::vector>>& grad_outs, const paddle::framework::AttributeMap& grad_attrs, + const std::unordered_map& backward_inplace_map, bool is_op_base_per_duplicable_input, size_t* outs_size) { std::string generated_grad_function_body = ""; const std::string& ins_name = "ins" + std::to_string(*outs_size); const std::string& outs_name = "outs" + std::to_string(*outs_size); const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); + const std::string& hooked_grads = "hooked_grads" + std::to_string(*outs_size); + + // [Generation] Get Full Zero + std::string fill_zero_str = ""; + if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; + if (grad_ins_grad_slotname_map.count(grad_input_name)) { + size_t fwd_output_position = fwd_outputs_name_pos_map.at( + grad_ins_grad_slotname_map.at(grad_input_name)); + const char* FILL_ZERO_TEMPLATE = + "egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[%d], " + "this->InputMeta()[%d]);\n"; + fill_zero_str += paddle::string::Sprintf( + FILL_ZERO_TEMPLATE, fwd_output_position, fwd_output_position); + } + } + } + generated_grad_function_body += fill_zero_str; + generated_grad_function_body += + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> " + + hooked_grads + + " = " + "GradNode" + + fwd_op_type + "::ApplyGradientHooks(grads);\n"; // [Generation] Get Ins Map std::unordered_set dispensable_input_name_set; @@ -2029,6 +2079,23 @@ static std::string GenerateSingleOpBase( for (const auto& in : in_vars) { if (in.duplicable()) duplicable_input_name_set.insert(in.name()); } + const char* CHECK_BACKWARD_INPLACE_TEMPLATE = + " // Check backward inplace info\n" + " bool %s = false;\n" + " %s\n" + " if (%s.initialized()) {\n" + " VLOG(10) << %s.name() << \"(%s) use_count: \" << " + "%s.impl().use_count();\n" + " if (%s.impl().use_count() == 1 || (%s.impl().use_count() == 2 && " + "%s.impl().get() 
== %s.impl().get())) {\n" + " %s = true;\n" + " }\n" + " }\n"; + const std::string& can_be_inplaced_name = + "can_be_inplaced" + std::to_string(*outs_size); + const std::string& bwd_inplace_input_name = + "backward_inplace_tensor" + std::to_string(*outs_size); + bool process_backward_inplace = false; std::string ins_contents_str = ""; for (auto iter : grad_ins) { const std::string& grad_input_name = iter.first; @@ -2051,16 +2118,52 @@ static std::string GenerateSingleOpBase( ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, struct_fwd_input_name); - + if (!backward_inplace_map.empty() && + backward_inplace_map.count(grad_input_name)) { + process_backward_inplace = true; + const char* GRAD_INS_FWD_TENSOR_WRAPPER_TEMPLATE = + "auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);"; + std::string tensor_wrapper_str = paddle::string::Sprintf( + GRAD_INS_FWD_TENSOR_WRAPPER_TEMPLATE, bwd_inplace_input_name, + struct_fwd_input_name); + const char* GRAD_INS_FWD_TENSOR_TEMPLATE = + "(&this->%s)->get_intermidiate_tensor()"; + std::string tensor_wrapper_intermidiate_tensor_str = + paddle::string::Sprintf(GRAD_INS_FWD_TENSOR_TEMPLATE, + struct_fwd_input_name); + generated_grad_function_body += paddle::string::Sprintf( + CHECK_BACKWARD_INPLACE_TEMPLATE, can_be_inplaced_name, + tensor_wrapper_str, bwd_inplace_input_name, bwd_inplace_input_name, + grad_input_name, bwd_inplace_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, bwd_inplace_input_name, + tensor_wrapper_intermidiate_tensor_str, can_be_inplaced_name); + } } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { // Fwd Tensor's Grad size_t fwd_output_position = fwd_outputs_name_pos_map.at( grad_ins_grad_slotname_map.at(grad_input_name)); const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },"; + "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s[%d]) },"; ins_contents_str += paddle::string::Sprintf( - GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position); - + GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, hooked_grads, + fwd_output_position); + if (!backward_inplace_map.empty() && + backward_inplace_map.count(grad_input_name)) { + process_backward_inplace = true; + const char* GRAD_INS_HOOKED_GRAD_TEMPLATE = "auto& %s = %s[%d][0];"; + std::string hooked_grads_tensor_str = paddle::string::Sprintf( + GRAD_INS_HOOKED_GRAD_TEMPLATE, bwd_inplace_input_name, hooked_grads, + fwd_output_position); + const char* GRAD_INS_GRAD_TENSOR_TEMPLATE = "grads[%d][0]"; + std::string grads_tensor_str = paddle::string::Sprintf( + GRAD_INS_GRAD_TENSOR_TEMPLATE, fwd_output_position); + generated_grad_function_body += paddle::string::Sprintf( + CHECK_BACKWARD_INPLACE_TEMPLATE, can_be_inplaced_name, + hooked_grads_tensor_str, bwd_inplace_input_name, + bwd_inplace_input_name, grad_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, grads_tensor_str, can_be_inplaced_name); + } } else { PADDLE_THROW(platform::errors::Fatal( "Detected mismatched slot names." 
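The use_count() test baked into CHECK_BACKWARD_INPLACE_TEMPLATE above is easier to read outside the printf-style template. A standalone sketch of the same heuristic, with simplified types (the real code inspects paddle::experimental::Tensor::impl()):

    #include <memory>

    // The grad tensor's buffer can be reused in place if the grad tensor is the
    // sole owner of it, or if there are exactly two owners and the other known
    // holder (the recovered tensor wrapper's intermediate tensor, or the
    // pre-hook grad, depending on the branch) points at the very same buffer.
    bool CanBeInplaced(const std::shared_ptr<void>& grad_impl,
                       const std::shared_ptr<void>& other_impl) {
      return grad_impl.use_count() == 1 ||
             (grad_impl.use_count() == 2 && grad_impl.get() == other_impl.get());
    }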
@@ -2112,7 +2215,6 @@ static std::string GenerateSingleOpBase( } VLOG(6) << "Generated Ins Map"; - // [Generation] Get Outs Map std::string outs_contents_str = ""; for (auto iter : grad_outs) { @@ -2161,9 +2263,12 @@ static std::string GenerateSingleOpBase( size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },"; + " if((!out_metas[%d].empty()) && " + "(!(out_metas[%d][0].IsStopGradient()))){ \n %s.insert({ \"%s\", " + "egr::EagerUtils::TrySyncToVars(%s[%d])});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position); + GRAD_OUTS_CONTENT_TEMPLATE, grads_position, grads_position, + outs_name, grad_output_name, hooked_grads, grads_position); } else { if (dispensable_input_name_set.count(fwd_name) && @@ -2174,18 +2279,20 @@ static std::string GenerateSingleOpBase( if (duplicable_input_name_set.count(fwd_name) && !is_op_base_per_duplicable_input) { const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].size() ) },"; + " if(!out_metas[%d].empty()){ %s.insert({ \"%s\", " + "egr::EagerUtils::CreateVars(out_metas[%d].size())});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + GRAD_OUTS_CONTENT_TEMPLATE, fwd_input_position, outs_name, + grad_output_name, fwd_input_position); } else { const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " + " if((!out_metas[%d].empty()) && " + "(!(out_metas[%d][0].IsStopGradient()))){ %s.insert({ \"%s\", " "{std::make_shared(egr::Controller::Instance(" - ")." - "GenerateUniqueName())}},"; + ").GenerateUniqueName())}});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + GRAD_OUTS_CONTENT_TEMPLATE, fwd_input_position, + fwd_input_position, outs_name, grad_output_name); } } } else { @@ -2195,16 +2302,15 @@ static std::string GenerateSingleOpBase( grad_output_name)); } } - if (outs_contents_str.size() > 0) - outs_contents_str.pop_back(); // // Remove trailing "," const char* BWD_OUTS_MAP_TEMPLATE = " std::map>> %s = { " - "%s };\n"; - std::string outs_map_str = paddle::string::Sprintf( - BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); + "std::vector>> %s;\n"; + std::string outs_map_str = + paddle::string::Sprintf(BWD_OUTS_MAP_TEMPLATE, outs_name); + generated_grad_function_body += outs_map_str; + generated_grad_function_body += outs_contents_str; generated_grad_function_body += "\n"; for (auto iter : grad_outs) { const std::string& grad_output_name = iter.first; @@ -2219,18 +2325,23 @@ static std::string GenerateSingleOpBase( !is_op_base_per_duplicable_input) { size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = - " if(%s.size() > 0) %s[\"%s\"] = egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].size() );\n"; + " if((%s.size() > 0) && (!out_metas[%d].empty()) && " + "(!out_metas[%d][0].IsStopGradient())) %s[\"%s\"] = " + "egr::EagerUtils::CreateVars( " + "out_metas[%d].size() );\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, grad_output_name, fwd_input_position); } else { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = - " if(%s.defined()) %s[\"%s\"] = " + " if(%s.defined() && 
(!out_metas[%d].empty()) && " + "(!out_metas[%d][0].IsStopGradient())) %s[\"%s\"] = " "{std::make_shared(egr::Controller::" "Instance().GenerateUniqueName())};\n"; generated_grad_function_body += paddle::string::Sprintf( - DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, + DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, + fwd_input_position, fwd_input_position, outs_name, grad_output_name); } } @@ -2245,6 +2356,27 @@ static std::string GenerateSingleOpBase( VLOG(6) << "Generated Outs Map"; + // [Generation] Process Backward Inplace + if (process_backward_inplace) { + const char* HANDLE_BACKWARD_INPLACE_BETWEEN_INPUT_AND_OUTPUT = + " if (%s && %s.count(\"%s\") && %s.count(\"%s\")) {\n" + " egr::EagerUtils::HandleViewBetweenInputAndOutput(%s[\"%s\"][0], " + "%s[\"%s\"][0]);\n" + " };\n"; + std::string backward_inplace_map_str = ""; + for (auto iter : backward_inplace_map) { + std::string backward_inplace_input_name = iter.first; + std::string backward_inplace_output_name = iter.second; + backward_inplace_map_str += paddle::string::Sprintf( + HANDLE_BACKWARD_INPLACE_BETWEEN_INPUT_AND_OUTPUT, + can_be_inplaced_name, ins_name, backward_inplace_input_name, + outs_name, backward_inplace_output_name, ins_name, + backward_inplace_input_name, outs_name, backward_inplace_output_name); + } + generated_grad_function_body += backward_inplace_map_str; + VLOG(6) << "Process Backward Inplace"; + } + // [Generation] Get Attrs Map const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = @@ -2289,16 +2421,20 @@ static std::string GenerateSingleOpBase( size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); if (!is_op_base_per_duplicable_input) { const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + " if (%s.find(\"%s\") != %s.end()) { outputs[%d] = " + "egr::EagerUtils::GetOutputs(%s[\"%s\"]); }\n"; outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); + BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name, outs_name, + fwd_input_position, outs_name, grad_out_name); } else { const char* BWD_OUTPUT_TEMPLATE = " " + "if (%s.find(\"%s\") != %s.end()) { " "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]" - ");\n"; + "); }\n"; outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name, - grad_out_name); + grad_out_name, outs_name, + outs_name, grad_out_name); } num_appended_outputs++; } else { @@ -2317,9 +2453,11 @@ static std::string GenerateSingleOpBase( if (fwd_outputs_name_pos_map.count(fwd_name)) { const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + " if (%s.find(\"%s\") != %s.end()) { outputs[%d] = " + "egr::EagerUtils::GetOutputs(%s[\"%s\"]); }\n"; outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, num_appended_outputs, outs_name, grad_out_name); + BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name, outs_name, + num_appended_outputs, outs_name, grad_out_name); num_appended_outputs++; } } @@ -2428,13 +2566,15 @@ static std::string GenerateGradNodeCCContents( const auto& grad_ins = op_base_info.GetGradIns(); const auto& grad_outs = op_base_info.GetGradOuts(); const auto& grad_attrs = op_base_info.GetGradAttrs(); + const auto& backward_inplace_map = op_base_info.GetBackwardInplaceMap(); const std::string& op_base_type = op_base_info.GetOpBaseType(); generated_grad_function_body += GenerateSingleOpBase( fwd_op_type, op_base_type, fwd_inputs_name_pos_map, 
fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, - grad_attrs, is_op_base_per_duplicable_input, &outs_size); + grad_attrs, backward_inplace_map, is_op_base_per_duplicable_input, + &outs_size); } if (is_op_base_per_duplicable_input) { @@ -2447,18 +2587,15 @@ static std::string GenerateGradNodeCCContents( } const char* BWD_RETURN_TEMPLATE = - " paddle::small_vector, " - "egr::kSlotSmallVectorSize> hooked_grads = " - "GradNode%s::ApplyGradientHooks(grads);\n" + " const auto& out_metas = OutputMeta();\n" " paddle::small_vector, " "egr::kSlotSmallVectorSize> outputs(%d);\n" " %s\n" " if(NeedComplexToRealConversion()) " "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; - generated_grad_function_body = - paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), - generated_grad_function_body); + generated_grad_function_body = paddle::string::Sprintf( + BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = @@ -2469,17 +2606,9 @@ static std::string GenerateGradNodeCCContents( "egr::kSlotSmallVectorSize>& grads, bool " "create_graph, bool is_new_grad) {\n" "%s" - "%s" "\n}"; - std::string fill_zero_str = ""; - if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { - fill_zero_str = - "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, " - "this->InputMeta());\n"; - } - std::string grad_function_str = - paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type, - fill_zero_str, generated_grad_function_body); + std::string grad_function_str = paddle::string::Sprintf( + GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); VLOG(6) << "Generated returns"; @@ -2847,19 +2976,20 @@ static void DygraphCodeGeneration(const std::string& output_dir) { auto& infer_inplace = paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; - std::map inplace_map; + std::map forward_inplace_map; // Inplace Function Generator. // `sum` op has duplicate input. Don't consider adding inplace strategy // for `sum` in temporary. 
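The loop that follows inverts the inferer's result: infer_inplace(true) reports {forward input -> forward output} pairs, while the generator wants forward_inplace_map keyed by the output name. A minimal standalone illustration:

    #include <map>
    #include <string>
    #include <unordered_map>

    // e.g. {"X" -> "Out"} becomes {"Out" -> "X"}.
    std::map<std::string, std::string> InvertInplacePairs(
        const std::unordered_map<std::string, std::string>& in_to_outs) {
      std::map<std::string, std::string> forward_inplace_map;
      for (const auto& pair : in_to_outs) {
        forward_inplace_map[pair.second] = pair.first;
      }
      return forward_inplace_map;
    }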
if (infer_inplace && !special_inplace_op_set.count(op_type)) { auto in_to_outs = infer_inplace(true); for (auto& inplace_pair : in_to_outs) { - inplace_map[inplace_pair.second] = inplace_pair.first; + forward_inplace_map[inplace_pair.second] = inplace_pair.first; } VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------"; std::pair inplace_body_and_declaration = - GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map); + GenerateForwardFunctionContents(fwd_info, bwd_info, + forward_inplace_map); fwd_function_str += inplace_body_and_declaration.first + "\n"; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 8467a6d7dfb6a..87b2ff986dc92 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -30,7 +30,9 @@ "divide_double_grad", "log_double_grad", "elu_double_grad", "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad", "square_double_grad", "celu_double_grad", "pad_double_grad", - "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad" + "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad", + "instance_norm_double_grad", "conv3d_double_grad", + "depthwise_conv2d_grad_grad" ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d8b909c3bacc1..d23d71b07626d 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -1404,7 +1404,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, const auto& out_metas = OutputMeta(); paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs}); for (int i = 0; i < {slot_num_bwd_outputs}; ++i) {{ - returns[i].resize(out_metas[i].size()); + out_metas[i].size() == 0 ? 
returns[i].resize(1) : returns[i].resize(out_metas[i].size()); }} """ diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index bcb9820419d0f..551262d259e08 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -250,7 +250,7 @@ TEST(EagerUtils, GetGradAccumulationNode) { ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0)); } -TEST(EagerUtils, FillZeroForEmptyGradInputs) { +TEST(EagerUtils, FillZeroForEmptyOptionalGradInput) { paddle::small_vector, egr::kSlotSmallVectorSize> grads = {std::vector(1)}; @@ -263,7 +263,7 @@ TEST(EagerUtils, FillZeroForEmptyGradInputs) { slot_metas[0][0].SetTensorMeta(tensor_meta); slot_metas[0][0].SetPlace(phi::CPUPlace()); - EagerUtils::FillZeroForEmptyGradInputs(&grads, slot_metas); + EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0], slot_metas[0]); eager_test::CompareTensorWithValue(grads[0][0], 0.0); } diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index fe1cdefb7d572..5a730e4dbf164 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -379,8 +379,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { "The hooked_grads.size() of RunProgramGradOp should " "be equal to 1.")); - egr::EagerUtils::FillZeroForEmptyGradInputs(&hooked_grads, - this->InputMeta()); + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&hooked_grads[0], + this->InputMeta()[0]); VLOG(3) << "hooked_grads[0].size() : " << hooked_grads[0].size(); std::vector x_grad; std::vector params_grad; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 9ccd91ca65733..7d9554c52eb6c 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -467,26 +467,16 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } } -void EagerUtils::FillZeroForEmptyGradInputs( - paddle::small_vector, - kSlotSmallVectorSize>* in_grads, - const paddle::small_vector, kSlotSmallVectorSize>& - grad_in_metas) { +void EagerUtils::FillZeroForEmptyOptionalGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas) { for (size_t i = 0; i < in_grads->size(); i++) { - for (size_t j = 0; j < (*in_grads)[i].size(); j++) { - paddle::experimental::Tensor& grad = (*in_grads)[i][j]; - if (!grad.initialized()) { - const GradSlotMeta& grad_in_meta = grad_in_metas[i][j]; - PADDLE_ENFORCE( - grad_in_meta.HasTensorMeta(), - paddle::platform::errors::Fatal( - "Unable to fill empty grad inputs due to empty GradSlotMeta")); - const auto& tensor_meta = grad_in_meta.GetTensorMeta(); - auto tensor_with_zero = paddle::experimental::full( - phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, - grad_in_meta.GetPlace()); - grad.set_impl(tensor_with_zero.impl()); - } + paddle::experimental::Tensor& grad = (*in_grads)[i]; + if (!grad.initialized() && grad_in_metas[i].HasTensorMeta()) { + auto tensor_with_zero = paddle::experimental::full( + phi::vectorize(grad_in_metas[i].GetTensorMeta().dims), 0.0, + grad_in_metas[i].GetTensorMeta().dtype, grad_in_metas[i].GetPlace()); + grad.set_impl(tensor_with_zero.impl()); } } } diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 63baebca53c37..c6389e998315c 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -236,11 +236,9 @@ class EagerUtils { /** * Fill Zero * **/ - static void 
FillZeroForEmptyGradInputs( - paddle::small_vector, - kSlotSmallVectorSize>* out_grads, - const paddle::small_vector, - kSlotSmallVectorSize>& grad_out_metas); + static void FillZeroForEmptyOptionalGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas); static void FillZeroForEmptyGradInput(paddle::experimental::Tensor* in_grad, const GradSlotMeta& grad_in_meta); static void FillZeroForEmptyOptionalGradInput( diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index fff78dd872c99..94753f8dd38e0 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -120,6 +120,7 @@ message BuildStrategy { optional bool fix_op_run_order = 13 [ default = false ]; optional bool allow_cuda_graph_capture = 14 [ default = false ]; optional int32 reduce_strategy = 15 [ default = 0 ]; + optional bool fuse_gemm_epilogue = 16 [ default = false ]; } message ExecutionStrategy { diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc index e86bb2926b640..79a06572d1427 100644 --- a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc @@ -30,13 +30,19 @@ void FillConstData(LoDTensor* out_t, T value) { void DeleteFillConstantOpPass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init("delete_fill_constant_op_pass", graph); GraphPatternDetector detector; - auto fill_constant_op = detector.mutable_pattern() - ->NewNode("fill_constant") - ->assert_is_op("fill_constant") - ->assert_is_not_op_input("ValueTensor") - ->assert_is_not_op_input("str_value") - ->assert_is_not_op_input("ShapeTensor") - ->assert_is_not_op_input("ShapeTensorList"); + auto fill_constant_op = + detector.mutable_pattern() + ->NewNode("fill_constant") + ->assert_is_op("fill_constant") + ->assert_is_not_op_input("ValueTensor") + ->assert_is_not_op_input("str_value") + ->assert_is_not_op_input("ShapeTensor") + ->assert_is_not_op_input("ShapeTensorList") + ->assert_more([&](Node* node) { + return node->Op() + ->GetAttrIfExists>("shape") + .size() == 1; + }); auto fill_constant_out = detector.mutable_pattern() ->NewNode("fill_constant_out") diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc index f48224cbdc24f..b72a63d37853c 100644 --- a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -22,6 +22,12 @@ namespace paddle { namespace framework { namespace ir { +static void GetTransposeAttrsFromOp(const OpDesc &op, bool *trans_x, + bool *trans_y) { + *trans_x = BOOST_GET_CONST(bool, op.GetAttr("trans_x")); + *trans_y = BOOST_GET_CONST(bool, op.GetAttr("trans_y")); +} + void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { EpiloguePassActivationCache cache; @@ -75,6 +81,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) return; + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y); + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); std::string activation = "none"; fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); @@ -85,6 +94,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, fused_gemm_epilogue_op_desc.SetAttr("activation", activation); 
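With the trans_x/trans_y early-return removed from IsGemmFromLinear_ (in the hunk further below), the fused op has to honor the same transpose semantics as matmul_v2, which is why both the forward and backward passes now copy these attributes onto fused_gemm_epilogue(_grad). A plain-C++ scalar reference of that contract, for the math only; it says nothing about how the fused cuBLASLt kernel is implemented:

    #include <cstddef>
    #include <vector>

    // C[m x n] = A' * B', where A' = trans_x ? A^T : A and B' = trans_y ? B^T : B.
    // A is stored row-major as m x k (or k x m when trans_x), B as k x n (or n x k).
    void GemmRef(const std::vector<float>& A, const std::vector<float>& B,
                 std::vector<float>* C, size_t m, size_t n, size_t k,
                 bool trans_x, bool trans_y) {
      for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < n; ++j) {
          float acc = 0.f;
          for (size_t p = 0; p < k; ++p) {
            float a = trans_x ? A[p * m + i] : A[i * k + p];
            float b = trans_y ? B[j * k + p] : B[p * n + j];
            acc += a * b;
          }
          (*C)[i * n + j] = acc;
        }
      }
    }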
fused_gemm_epilogue_op_desc.SetAttr("op_role", matmul_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); @@ -154,6 +165,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( auto activation = act_op->Op()->Type(); + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y); + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); @@ -163,6 +177,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( fused_gemm_epilogue_op_desc.SetAttr("activation", activation); fused_gemm_epilogue_op_desc.SetAttr("op_role", matmul_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); @@ -274,6 +290,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, matmul_grad_op_desc)) return; + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y); + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); std::string activation_grad = "none"; fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); @@ -292,6 +311,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, activation_grad); fused_gemm_epilogue_grad_op_desc.SetAttr( "op_role", matmul_grad_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_grad_node = g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); @@ -394,6 +415,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( auto activation_grad = act_grad_op->Op()->Type(); + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y); OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); fused_gemm_epilogue_grad_op_desc.SetInput("DOut", @@ -410,6 +433,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( activation_grad); fused_gemm_epilogue_grad_op_desc.SetAttr( "op_role", matmul_grad_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_grad_node = g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); @@ -456,10 +481,6 @@ bool FuseGemmEpiloguePass::IsGemmFromLinear_( if (tmp_vec.size() > 0) return false; } } - if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || - BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) - return false; - return true; } diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 55470db312f81..63e402cb52983 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -489,14 +489,6 @@ void QuantDequantMkldnnPass::UpdateActivations(ir::Graph* graph) const { std::string activation; if (op_desc->GetAttrIfExists("fuse_relu")) { activation = "relu"; - } else if (op_desc->GetAttrIfExists("fuse_brelu")) 
{ - activation = "relu6"; - float alpha = 6.0; - if (op_desc->HasAttr("fuse_brelu_threshold")) { - alpha = BOOST_GET_CONST(float, - op_desc->GetAttr("fuse_brelu_threshold")); - } - op_node->Op()->SetAttr("fuse_alpha", alpha); } op_node->Op()->SetAttr("fuse_activation", activation); } diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index 0fc458723ffe4..60d661f7740d0 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -91,6 +91,10 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, scale_matmul_pattern); GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, scale_matmul_pattern); + if ((scale_out->outputs).size() != 1) { + return; + } + if (scale_op->Op()->GetAttrIfExists("bias") == 0.0) { auto matmul_alpha = matmul_op->Op()->GetAttrIfExists("alpha"); auto scale_scale = scale_op->Op()->GetAttrIfExists("scale"); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index afd1bf338c45e..69f14d7903c0b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1116,6 +1116,21 @@ class RuntimeInferShapeContext : public InferShapeContext { const RuntimeContext& ctx_; }; +struct OperatorWithKernel::CacheImpl { + explicit CacheImpl(phi::KernelContext* kernel_ctx, + RuntimeInferShapeContext* infer_shape_ctx) + : kernel_ctx_(kernel_ctx), infer_shape_ctx_(infer_shape_ctx) {} + + phi::KernelContext* getKernelContext() { return kernel_ctx_.get(); } + RuntimeInferShapeContext* getRuntimeInferShapeContext() { + return infer_shape_ctx_.get(); + } + + private: + std::unique_ptr kernel_ctx_; + std::unique_ptr infer_shape_ctx_; +}; + static void CheckTensorNANOrInf(const std::string& op_type, const std::string& name, const framework::Tensor& tensor) { @@ -2323,6 +2338,8 @@ Scope* OperatorWithKernel::PreparePhiData( Tensor out; framework::TensorCopySync(*tensor_in, expected_place, &out); SetTensorToVariable(*var, out, trans_var); + + need_prepare_phi_data_ = true; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2e00e07535b1d..2efa2e4bd8a75 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -698,6 +698,7 @@ class OperatorWithKernel : public OperatorBase { mutable std::unique_ptr runtime_ctx_; mutable const Scope* pre_scope_ = nullptr; mutable bool need_prepare_data_ = true; + mutable bool need_prepare_phi_data_ = false; mutable bool enable_cache_runtime_context_ = false; mutable bool all_kernels_must_compute_runtime_shape_ = false; mutable std::mutex cache_update_mutex_; @@ -710,6 +711,9 @@ class OperatorWithKernel : public OperatorBase { mutable std::unique_ptr kernel_signature_; mutable std::unique_ptr pt_kernel_; mutable std::unique_ptr arg_map_fn_; + + struct CacheImpl; + mutable CacheImpl* impl_{nullptr}; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b2d8afaa7b49c..aafbe57e05ff2 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -273,6 +273,11 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { if (pass->Type() != "graph_viz_pass" && !disable_logs_) { PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); } 
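On the operator.h/operator.cc side above, CacheImpl bundles a phi::KernelContext with a RuntimeInferShapeContext, and PreparePhiData now flips need_prepare_phi_data_ whenever it had to copy an input to the kernel's expected place. How the flag and the cache are consumed is outside this patch; a heavily simplified sketch of the presumed pattern, using non-Paddle placeholder types, might look like:

    #include <memory>

    struct KernelCtx {};      // stands in for phi::KernelContext
    struct InferShapeCtx {};  // stands in for RuntimeInferShapeContext

    class CachedOp {
     public:
      // Slow path: (re)build both contexts and remember them.
      void BuildCache() { cache_ = std::make_unique<Cache>(); }
      // Called from data preparation when an input had to be transformed.
      void MarkPhiDataDirty() const { need_prepare_phi_data_ = true; }
      // Reusing the cached contexts is only safe when nothing forced a fresh
      // phi data preparation since they were built.
      bool CanReuseCache() const { return cache_ && !need_prepare_phi_data_; }

     private:
      struct Cache {
        std::unique_ptr<KernelCtx> kernel_ctx = std::make_unique<KernelCtx>();
        std::unique_ptr<InferShapeCtx> infer_shape_ctx =
            std::make_unique<InferShapeCtx>();
      };
      mutable bool need_prepare_phi_data_ = false;
      std::unique_ptr<Cache> cache_;
    };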
+ // delete_fill_constant_op_pass is not apply under trt dynamic shape + if (pass->Type() == "delete_fill_constant_op_pass") { + bool use_dynamic = pass->Get("with_dynamic_shape"); + if (use_dynamic) continue; + } graph.reset(pass->Apply(graph.release())); } return graph; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index adc3fc46f72ac..735e1b7be4c1f 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -633,11 +633,6 @@ void AnalysisConfig::Update() { (pass == "conv_bn_fuse_pass")) { continue; } - // delete_fill_constant_op_pass is not used under trt dynamic shape - if ((!min_input_shape_.empty() || trt_tuned_dynamic_shape_) && - pass == "delete_fill_constant_op_pass") { - continue; - } pass_builder()->AppendPass(pass); } } diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 4e6b82d2dc146..0a6d24f90722e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -56,8 +56,6 @@ SlicePlugin::SlicePlugin(std::vector starts, std::vector ends, std::vector axes, bool with_fp16) : starts_(starts), ends_(ends), axes_(axes) { with_fp16_ = with_fp16; - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); } SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { @@ -66,15 +64,10 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &ends_); DeserializeValue(&serial_data, &serial_length, &axes_); DeserializeValue(&serial_data, &serial_length, &with_fp16_); - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); + DeserializeValue(&serial_data, &serial_length, &offset_info_); } -SlicePlugin::~SlicePlugin() { - cudaStreamDestroy(copy_stream_); - cudaEventDestroy(copy_event_); - cudaFree(offset_temp_data_); -} +SlicePlugin::~SlicePlugin() { cudaFree(offset_temp_data_); } SlicePlugin *SlicePlugin::clone() const TRT_NOEXCEPT { return new SlicePlugin(starts_, ends_, axes_, with_fp16_); @@ -159,11 +152,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, } cudaMemcpyAsync(offset_temp_data_, offset_info.data(), - sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, - copy_stream_); - - cudaEventRecord(copy_event_, copy_stream_); - cudaStreamWaitEvent(stream, copy_event_, 0); + sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream); int threads = 256; int blocks = (out_num + threads - 1) / threads; @@ -190,7 +179,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, size_t SlicePlugin::getSerializationSize() const TRT_NOEXCEPT { return getBaseSerializationSize() + SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_) + - SerializedSize(with_fp16_); + SerializedSize(with_fp16_) + SerializedSize(offset_info_); } void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { @@ -199,6 +188,7 @@ void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, offset_info_); } // Dynamic Plugin below. 
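The SlicePlugin enqueue rewrite above (and the SlicePluginDynamic one that follows) drops the dedicated copy_stream_/copy_event_ pair: work submitted to a single CUDA stream executes in issue order, so copying the offsets with cudaMemcpyAsync on the enqueue stream and then launching the slice kernel on that same stream already guarantees the kernel sees the copied data. Keeping offset_info_ as a member also lets it be serialized with the other plugin fields. A minimal sketch of the ordering argument (names are illustrative, not the plugin's):

    #include <cuda_runtime.h>
    #include <vector>

    void UploadOffsetsThenLaunch(int* device_offsets,
                                 const std::vector<int>& host_offsets,
                                 cudaStream_t stream) {
      // 1) Enqueue the host-to-device copy on `stream`.
      cudaMemcpyAsync(device_offsets, host_offsets.data(),
                      host_offsets.size() * sizeof(int), cudaMemcpyHostToDevice,
                      stream);
      // 2) Any kernel launched afterwards on the same `stream` runs after the
      //    copy completes, so no extra event or second stream is needed.
    }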
@@ -209,8 +199,6 @@ SlicePluginDynamic::SlicePluginDynamic(std::vector starts, bool with_fp16) : starts_(starts), ends_(ends), axes_(axes), decrease_axis_(decrease_axis) { with_fp16_ = with_fp16; - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); } SlicePluginDynamic::SlicePluginDynamic(void const *serialData, @@ -220,13 +208,10 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData, DeserializeValue(&serialData, &serialLength, &axes_); DeserializeValue(&serialData, &serialLength, &decrease_axis_); DeserializeValue(&serialData, &serialLength, &with_fp16_); - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); + DeserializeValue(&serialData, &serialLength, &offset_info_); } void SlicePluginDynamic::destroy() TRT_NOEXCEPT { - cudaStreamDestroy(copy_stream_); - cudaEventDestroy(copy_event_); cudaFree(offset_temp_data_); delete this; } @@ -236,7 +221,7 @@ int SlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } size_t SlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t size = SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_) + SerializedSize(decrease_axis_) + - SerializedSize(with_fp16_); + SerializedSize(with_fp16_) + SerializedSize(offset_info_); return size; } @@ -247,6 +232,7 @@ void SlicePluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, axes_); SerializeValue(&buffer, decrease_axis_); SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, offset_info_); } nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( @@ -361,23 +347,19 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, offsets[axes_[i]] = starts_[i]; } - std::vector offset_info; + offset_info_.resize(num_dims * 3); for (size_t i = 0; i < num_dims; ++i) { - offset_info.push_back(offsets[i]); - offset_info.push_back(extends[i]); - offset_info.push_back(seg_offsets[i]); + offset_info_[i * 3 + 0] = offsets[i]; + offset_info_[i * 3 + 1] = extends[i]; + offset_info_[i * 3 + 2] = seg_offsets[i]; } if (offset_temp_data_ == nullptr) { cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int)); } - cudaMemcpyAsync(offset_temp_data_, offset_info.data(), - sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, - copy_stream_); - - cudaEventRecord(copy_event_, copy_stream_); - cudaStreamWaitEvent(stream, copy_event_, 0); + cudaMemcpyAsync(offset_temp_data_, offset_info_.data(), + sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream); int threads = 256; int blocks = (out_num + threads - 1) / threads; diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 4c07f0be36864..6b50a52df1fe5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -64,8 +64,7 @@ class SlicePlugin : public PluginTensorRT { std::vector ends_; std::vector axes_; int* offset_temp_data_{nullptr}; - cudaEvent_t copy_event_; - cudaStream_t copy_stream_; + std::vector offset_info_; }; class SlicePluginCreator : public TensorRTPluginCreator { @@ -144,8 +143,7 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { std::vector axes_; int decrease_axis_; int* offset_temp_data_{nullptr}; - cudaEvent_t copy_event_; - cudaStream_t copy_stream_; + std::vector offset_info_; }; class SlicePluginDynamicCreator : public TensorRTPluginCreator { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 
99152607158eb..46e1a500e4870 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -931,7 +931,13 @@ class AllocatorFacadePrivate { void WrapStatAllocator() { for (auto& pair : allocators_) { - pair.second = std::make_shared(pair.second); + // Now memory stats is only supported for CPU and GPU + const platform::Place& place = pair.first; + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place) || + platform::is_gpu_place(place)) { + pair.second = std::make_shared(pair.second); + } } } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 276c6bb0e69b8..5e5aea6dab2cc 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/pinned_allocator.h" - +#include "paddle/fluid/memory/stats.h" namespace paddle { namespace memory { namespace allocation { @@ -24,6 +24,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); delete allocation; } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { @@ -33,6 +34,7 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 68209bbaabeca..8b54b961596c2 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -45,11 +45,13 @@ class StatAllocator : public Allocator { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - if (platform::is_cpu_place(allocation->place())) { - HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + const platform::Place& place = allocation->place(); + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), allocation->size()); } else { - DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), allocation->size()); } return allocation.release(); diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 06038804e6efe..e1077d66c54ec 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -211,6 +211,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { if (result == gpuSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return p; } else { LOG(WARNING) << "cudaHostAlloc failed."; @@ -255,6 +256,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { err)); } #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); } bool CUDAPinnedAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/operators/abs_op_mlu.cc b/paddle/fluid/operators/abs_op_mlu.cc new file mode 100644 index 0000000000000..3a3a484ea775e --- /dev/null +++ 
b/paddle/fluid/operators/abs_op_mlu.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AbsMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Abs(ctx, input_desc.get(), GetBasePtr(input), output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class AbsGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*x); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + Tensor sign_x; + sign_x.mutable_data(x->dims(), ctx.GetPlace()); + + MLUCnnl::Sign(ctx, input_desc.get(), GetBasePtr(x), input_desc.get(), + GetBasePtr(&sign_x)); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), input_desc.get(), + GetBasePtr(&sign_x), input_desc.get(), GetBasePtr(dout), + input_desc.get(), GetBasePtr(dx), ToCnnlDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(abs, ops::AbsMLUKernel, + ops::AbsMLUKernel); + +REGISTER_OP_MLU_KERNEL(abs_grad, ops::AbsGradMLUKernel, + ops::AbsGradMLUKernel); diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 680183b6adf40..5c6b276c0172a 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -34,11 +34,18 @@ struct VisitDataArgNPUMaxFunctor { out.template mutable_data(ctx.GetPlace()); auto axis = ctx.Attr("axis"); auto dtype = ctx.Attr("dtype"); + const bool& flatten = ctx.Attr("flatten"); + + Tensor transformed_x(x.type()); + transformed_x.ShareDataWith(x); + if (flatten) { + transformed_x.Resize(phi::make_ddim({x.numel()})); + } auto stream = ctx.template device_context().stream(); NpuOpRunner runner; runner.SetType("ArgMaxV2") - .AddInput(x) + .AddInput(transformed_x) .AddInput(std::vector{axis}) .AddOutput(out) .AddAttrDataType("dtype", dtype) diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 0e0ea72208488..bb498047a50b0 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -91,6 +91,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(alltoall, 
ops::AllToAllOpCUDAKernel, ops::AllToAllOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::AllToAllOpCUDAKernel, +#endif ops::AllToAllOpCUDAKernel, ops::AllToAllOpCUDAKernel, ops::AllToAllOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 0d97ffa96dc5c..62ed916d6e08c 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -90,6 +90,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(c_allgather, ops::CAllGatherOpCUDAKernel, ops::CAllGatherOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CAllGatherOpCUDAKernel, +#endif ops::CAllGatherOpCUDAKernel, ops::CAllGatherOpCUDAKernel, ops::CAllGatherOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 8fe7fce21e465..565633c2e7b2d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -19,6 +19,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( c_allreduce_sum, ops::CAllReduceOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CAllReduceOpCUDAKernel, +#endif ops::CAllReduceOpCUDAKernel, ops::CAllReduceOpCUDAKernel, ops::CAllReduceOpCUDAKernel, diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index eeae16a0d71f3..478dc85914964 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -98,6 +98,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(c_broadcast, ops::CBroadcastOpCUDAKernel, ops::CBroadcastOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CBroadcastOpCUDAKernel, +#endif ops::CBroadcastOpCUDAKernel, ops::CBroadcastOpCUDAKernel, ops::CBroadcastOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 9b05e940d4f60..fda192c45e779 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -76,6 +76,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel, ops::CReduceScatterOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CReduceScatterOpCUDAKernel, +#endif ops::CReduceScatterOpCUDAKernel, ops::CReduceScatterOpCUDAKernel, ops::CReduceScatterOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index f7a2e198db938..67c30438869b1 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -224,6 +224,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::RecvOpV2CUDAKernel, +#endif ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 8878b7c3449b9..cfb3a11513a21 100644 --- 
a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -197,6 +197,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(send_v2, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::SendOpV2CUDAKernel, +#endif ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt index ca07d4a36ff3c..8de061a3cc2f6 100644 --- a/paddle/fluid/operators/compat/conv2d.pbtxt +++ b/paddle/fluid/operators/compat/conv2d.pbtxt @@ -77,14 +77,6 @@ extra { name: "fuse_relu" type: BOOLEAN } - attrs { - name: "fuse_brelu" - type: BOOLEAN - } - attrs { - name: "fuse_brelu_threshold" - type: FLOAT - } attrs { name: "fuse_activation" type: STRING @@ -134,4 +126,3 @@ extra { type: BOOLEAN } } - diff --git a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt index ded143986159f..1fbb99c03e833 100644 --- a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt +++ b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt @@ -69,14 +69,6 @@ extra { name: "fuse_relu" type: BOOLEAN } - attrs { - name: "fuse_brelu" - type: BOOLEAN - } - attrs { - name: "fuse_brelu_threshold" - type: FLOAT - } attrs { name: "fuse_activation" type: STRING @@ -126,4 +118,3 @@ extra { type: BOOLEAN } } - diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 405794783812b..f084862b419d5 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -348,14 +348,6 @@ void Conv2DOpMaker::Make() { AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false) .AsExtra(); - AddAttr("fuse_brelu", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); - AddAttr("fuse_brelu_threshold", - "(float, default false 6.0) Only used in mkldnn kernel") - .SetDefault(6.0f) - .AsExtra(); AddAttr("fuse_activation", "(string, default \"\") Only used in mkldnn kernel") .SetDefault("") diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index d1a1aa3008c8b..070bf9511a9fe 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -145,8 +145,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { binary_prim->execute(astream, args); astream.wait(); - z->set_layout(DataLayout::kMKLDNN); - z->set_format(platform::GetMKLDNNFormat(*dst_memory)); + z->set_mem_desc(dst_memory->get_desc()); } }; @@ -179,7 +178,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); + dout->mem_desc(), platform::to_void_cast(dout->data())); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -189,7 +188,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { // elementwise_add & elementwise_sub if (BINARY_OP == dnnl::algorithm::binary_add || BINARY_OP == dnnl::algorithm::binary_sub) { - dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(dst_memory, 
reorder_src_memory_p); @@ -218,8 +217,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { } astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dx->set_mem_desc(dst_memory->get_desc()); } if (dy) { @@ -232,7 +230,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { BINARY_OP == dnnl::algorithm::binary_sub) { if (dout->dims() == dy->dims()) { auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dy, dout->format(), ctx.GetPlace()); + dy, dout->mem_desc(), ctx.GetPlace()); dnnl::primitive_attr reorder_attr; std::vector scales(1); @@ -301,7 +299,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dst_memory = dst_dy_memory; } astream.wait(); - dy->set_layout(DataLayout::kMKLDNN); if (dout->dims() != dy->dims()) { // Broadcasting @@ -324,10 +321,10 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { {DNNL_ARG_DST, *dst_memory}, }); astream.wait(); - dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( - phi::vectorize(dy->dims())))); + dy->set_mem_desc(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims()))); } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dy->set_mem_desc(dst_memory->get_desc()); } } } diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 6c068d25d07a8..a6130c272d72b 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -217,16 +217,18 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - T s = scale[0]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[0]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[i]; - T v = x > s ? s : x; + ComputeDataType x = static_cast(in[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt_t * inv_s * v; - out[i] = static_cast( - round(static_cast::type>(v))); + out[i] = static_cast(round(v)); } } @@ -237,18 +239,19 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - T s = scale[0]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[0]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[i]; + ComputeDataType x = static_cast(in[i]); x = x > s ? s : x; x = x < -s ? 
-s : x; x = bin_cnt_t * inv_s * x; - x = static_cast( - round(static_cast::type>(x))); - out[i] = (x * s) / bin_cnt_t; + x = round(x); + out[i] = static_cast((x * s) / bin_cnt_t); } } @@ -302,17 +305,18 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, const T* in_c = in + blockIdx.x * channel_size; T* out_c = out + blockIdx.x * channel_size; - T s = scale[blockIdx.x]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[blockIdx.x]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int64_t i = tid; i < channel_size; i += blockDim.x) { - T x = in_c[i]; - T v = x > s ? s : x; + ComputeDataType x = static_cast(in_c[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt_t * inv_s * v; - out_c[i] = static_cast( - round(static_cast::type>(v))); + out_c[i] = static_cast(round(v)); } } @@ -322,16 +326,17 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN( const T* in, const T* scale, const int bin_cnt, const int64_t n, const int nScale, const int quant_stride, T* out) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) { - T s = scale[(i / quant_stride) % nScale]; - T inv_s = inverse(s); - T x = in[i]; - T v = x > s ? s : x; + ComputeDataType s = + static_cast(scale[(i / quant_stride) % nScale]); + ComputeDataType inv_s = inverse(s); + ComputeDataType x = static_cast(in[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt_t * inv_s * v; - out[i] = static_cast( - round(static_cast::type>(v))); + out[i] = static_cast(round(v)); } } diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt old mode 100644 new mode 100755 index 03351dbca09e5..e23891d899de6 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -22,8 +22,10 @@ register_operators(EXCLUDES fused_transformer_op fused_feedforward_op fused_multi_transformer_op + fused_bias_dropout_residual_layer_norm_op resnet_unit_op - fused_gemm_epilogue_op) + fused_gemm_epilogue_op + fused_gate_attention_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -58,6 +60,7 @@ if (WITH_GPU OR WITH_ROCM) op_library(yolo_box_head_op) op_library(yolo_box_post_op) op_library(fused_embedding_eltwise_layernorm_op) + op_library(fused_gate_attention_op) # fusion_group if(NOT APPLE AND NOT WIN32) op_library(fusion_group_op DEPS device_code) @@ -79,6 +82,7 @@ if (WITH_GPU OR WITH_ROCM) # fused_attention_op op_library(fused_attention_op) op_library(fused_multi_transformer_op) + op_library(fused_bias_dropout_residual_layer_norm_op) endif() # resnet_unit needs cudnn 8.0 above if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 9542f0742ea34..304aad16ad0c6 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" @@ -21,6 +25,8 @@ limitations under the License. */ namespace paddle { namespace operators { + +using Tensor = framework::Tensor; // support gemm-nt and gemm-nn, which is used in fused_attention_op. template class AttnMatMul { @@ -45,31 +51,21 @@ class AttnMatMul { framework::Tensor* bias_out) { // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major. // here: (transa, transb): nt, input * weight. - CBLAS_TRANSPOSE transA = CblasNoTrans; - CBLAS_TRANSPOSE transB = CblasNoTrans; - if (transA_) { - transA = CblasTrans; - } - if (transB_) { - transB = CblasTrans; - } + CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE transB = transB_ ? CblasTrans : CblasNoTrans; T alpha = static_cast(1.0); T beta = static_cast(0.0); - // here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) + // (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, input->data(), weight->data(), beta, output->data()); if (compute_bias_) { - // compute output + bias - std::vector ins; - std::vector outs; - ins.emplace_back(output); - ins.emplace_back(bias); - outs.emplace_back(bias_out); - int elewise_add_axis = -1; + // bias_out = output + bias + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; phi::funcs::BroadcastKernel( - dev_ctx_, ins, &outs, elewise_add_axis, phi::funcs::AddFunctor()); + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); } } @@ -77,82 +73,71 @@ class AttnMatMul { const framework::Tensor* weight, const framework::Tensor* d_output, framework::Tensor* d_input, framework::Tensor* d_weight, - framework::Tensor* d_bias) { + framework::Tensor* d_bias, bool use_addto = false) { T alpha = static_cast(1.0); - T beta = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); - - CBLAS_TRANSPOSE dB_transA = CblasNoTrans; - CBLAS_TRANSPOSE dB_transB = CblasNoTrans; - CBLAS_TRANSPOSE dA_transA = CblasNoTrans; - CBLAS_TRANSPOSE dA_transB = CblasNoTrans; - int dB_m = 1; - int dB_n = 1; - int dB_k = 1; - int dA_m = 1; - int dA_n = 1; - int dA_k = 1; - - T* dB_input_1_ptr = nullptr; - T* dB_input_2_ptr = nullptr; - T* dB_output_ptr = d_weight->data(); - - T* dA_input_1_ptr = nullptr; - T* dA_input_2_ptr = nullptr; - T* dA_output_ptr = d_input->data(); + T beta_dA = use_addto ? 
static_cast(1.0) : static_cast(0.0); + T beta_dB = static_cast(0.0); + auto blas = phi::funcs::GetBlas(dev_ctx_); if (!transA_) { - // fw: gemm-nt + // forward: gemm-nt if (transB_) { - // bw: gemm-tn, dB = (dC)^t * A - dB_transA = CblasTrans; - dB_transB = CblasNoTrans; - dB_m = output_size_; - dB_n = input_size_; - dB_k = bsz_seq_; - - // bw: gemm-nn, dA = dC * B - dA_transA = CblasNoTrans; - dA_transB = CblasNoTrans; - dA_m = bsz_seq_; - dA_n = input_size_; - dA_k = output_size_; - - blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, - d_output->data(), input->data(), beta, dB_output_ptr); - blas.GEMM(dA_transA, dA_transB, dA_m, dA_n, dA_k, alpha, - d_output->data(), weight->data(), beta, dA_output_ptr); + // backward: gemm-tn, dB = (dC)^T * A + if (d_weight) { + int dB_m = output_size_; + int dB_n = input_size_; + int dB_k = bsz_seq_; + + T* dB_output_ptr = d_weight->data(); + blas.GEMM(CblasTrans, CblasNoTrans, dB_m, dB_n, dB_k, alpha, + d_output->data(), input->data(), beta_dB, + dB_output_ptr); + } + + // backward: gemm-nn, dA = dC * B + if (d_input) { + int dA_m = bsz_seq_; + int dA_n = input_size_; + int dA_k = output_size_; + + T* dA_output_ptr = d_input->data(); + blas.GEMM(CblasNoTrans, CblasNoTrans, dA_m, dA_n, dA_k, alpha, + d_output->data(), weight->data(), beta_dA, + dA_output_ptr); + } } else { // fw: gemm-nn - // bw: gemm-tn, dB = A^t * dC - dB_transA = CblasTrans; - dB_transB = CblasNoTrans; - dB_m = input_size_; - dB_n = output_size_; - dB_k = bsz_seq_; - - // bw: gemm-nt, dA = dC * B^t - dA_transA = CblasNoTrans; - dA_transB = CblasTrans; - dA_m = bsz_seq_; - dA_n = input_size_; - dA_k = output_size_; - - blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, - input->data(), d_output->data(), beta, dB_output_ptr); - blas.GEMM(dA_transA, dA_transB, dA_m, dA_n, dA_k, alpha, - d_output->data(), weight->data(), beta, dA_output_ptr); + // backward: gemm-tn, dB = A^T * dC + if (d_weight) { + int dB_m = input_size_; + int dB_n = output_size_; + int dB_k = bsz_seq_; + + T* dB_output_ptr = d_weight->data(); + blas.GEMM(CblasTrans, CblasNoTrans, dB_m, dB_n, dB_k, alpha, + input->data(), d_output->data(), beta_dB, + dB_output_ptr); + } + + // backward: gemm-nt, dA = dC * B^T + if (d_input) { + int dA_m = bsz_seq_; + int dA_n = input_size_; + int dA_k = output_size_; + + T* dA_output_ptr = d_input->data(); + blas.GEMM(CblasNoTrans, CblasTrans, dA_m, dA_n, dA_k, alpha, + d_output->data(), weight->data(), beta_dA, + dA_output_ptr); + } } - } else if (transB_) { - PADDLE_THROW(platform::errors::InvalidArgument( - "AttnMatMul wrapper do not support (transA=T, transB=T)" - "parameters.")); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "AttnMatMul wrapper do not support (transA=T, transB=N)" + "AttnMatMul wrapper do not support (transA=T, transB=T/N)" "parameters.")); } - if (compute_bias_) { - // reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} + if (compute_bias_ && d_bias) { + // reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} or {0,1,2,3} + // -> {3} or {0,1,2,3,4} -> {3,4} const auto input_dims = d_output->dims(); const auto output_dims = d_bias->dims(); bool support_case_1 = @@ -163,11 +148,22 @@ class AttnMatMul { bool support_case_2 = (input_dims.size() == 3 && output_dims.size() == 1 && (input_dims[2] == output_dims[0])); + bool support_case_3 = + (input_dims.size() == 4 && output_dims.size() == 1 && + input_dims[3] == output_dims[0]); + bool support_case_4 = + (input_dims.size() == 5 && output_dims.size() == 2 && + input_dims[3] == 
output_dims[0] && input_dims[4] == output_dims[1]); + + gpuStream_t stream = dev_ctx_.stream(); if (support_case_1 || support_case_2) { - gpuStream_t stream = dev_ctx_.stream(); TensorReduceImpl>( dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, stream); + } else if (support_case_3 || support_case_4) { + TensorReduceImpl>( + dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1, 2}, + stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only support reduce when the input dims are [0,1,2,3,4] and " diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 0e9fba73933b7..38f9aff226ea9 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -297,7 +300,6 @@ class FMHARef { phi::SoftmaxBackwardCUDAKernelDriver( dev_ctx_, softmax_out_tensor, *softmax_out_grad_tensor, softmax_axis, src_mask_out_grad_tensor); - // recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out + // src_mask // Special case when dy is not needed and dx doesn't reduce diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 1f377810a2287..a1adec9641a6e 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -194,7 +194,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // the same as QKOut's shape. ctx->SetOutputDim("AttnDropoutOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); - if (ctx->Attrs().Get("attn_dropout_is_test") == false) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } @@ -206,7 +206,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { ctx->SetOutputDim("FMHAOut", {x_dim[0], x_dim[1], y_dim[1], y_dim[2]}); ctx->SetOutputDim("OutLinearOut", ctx->GetInputDim("X")); - if (ctx->Attrs().Get("dropout_is_test") == false) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); } @@ -301,7 +301,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { platform::errors::InvalidArgument( "'attn_dropout_rate' must be between 0.0 and 1.0.")); }); - AddAttr("attn_dropout_is_test", + AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); @@ -345,11 +345,6 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { platform::errors::InvalidArgument( "'dropout_rate' must be between 0.0 and 1.0.")); }); - - AddAttr("dropout_is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); AddAttr("dropout_fix_seed", "A flag indicating whether to use a fixed seed to generate " "random mask. 
NOTE: DO NOT set this flag to true in " @@ -418,10 +413,9 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->Attrs().Get("attn_dropout_is_test"), false, - platform::errors::InvalidArgument( - "GradOp is only callable when attn_dropout_is_test is false")); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); if (ctx->Attrs().Get("pre_layer_norm") == false) { OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean", diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index ec8a4d962e808..f25bd53992894 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -109,7 +109,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { const float ln_epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_rate = ctx.Attr("attn_dropout_rate"); - bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + bool is_test_1 = ctx.Attr("is_test"); auto &dropout_implementation_1 = ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = @@ -280,7 +280,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { const float ln2epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_prob = ctx.Attr("attn_dropout_rate"); - bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + bool is_test_1 = ctx.Attr("is_test"); auto &dropout_implementation_1 = ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc new file mode 100644 index 0000000000000..781f51d70ec66 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -0,0 +1,239 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnMean"), "Output", "LnMean", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnVariance"), "Output", "LnVariance", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), "Output", + "BiasDropoutResidualOut", "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"), "Output", "DropoutMaskOut", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", + "FusedBiasDropoutResidualLnOp"); + auto x_dim = ctx->GetInputDim("X"); + int left = 1; + for (int i = 0; i < x_dim.size() - 1; i++) { + left *= x_dim[i]; + } + ctx->SetOutputDim("BiasDropoutResidualOut", ctx->GetInputDim("X")); + if (ctx->Attrs().Get("is_test") == false) { + ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); + } + ctx->SetOutputDim("LnMean", {left}); + ctx->SetOutputDim("LnVariance", {left}); + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = framework::TransToProtoVarType(input->dtype()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class FusedBiasDropoutResidualLnOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("Residual", "The residual tensor."); + AddInput("Bias", "The linear bias tensor.").AsDispensable(); + AddInput("LnScale", + "(optional) Scale is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddInput("LnBias", + "(optional) Bias is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddOutput("BiasDropoutResidualOut", "Output of bias + dropout + residual.") + .AsIntermediate(); + AddOutput("DropoutMaskOut", "The random sampled dropout mask.") + .AsIntermediate(); + AddOutput("LnMean", "Mean of the current mini batch.").AsIntermediate(); + AddOutput("LnVariance", "Variance of the current mini batch.") + .AsIntermediate(); + AddOutput("Y", "Result."); + AddAttr("dropout_rate", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("dropout_fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. 
Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(true); + AddAttr("dropout_seed", "Dropout random seed.").SetDefault(0); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "The meaning is the same as 'attn_dropout_implementation'.") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("ln_epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &ln_epsilon) { + PADDLE_ENFORCE_EQ(ln_epsilon >= 0.0f && ln_epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' of the LayerNorm should be between " + "0.0 and 0.001, But received [%s].", + ln_epsilon)); + }); + + AddComment(R"DOC( + Add fused bias_dropout_residual_layer_norm op whose logic is as follows: + // @input: [batch_size, seq_len, embed_dim] + // @final_out: [batch_size, seq_len, embed_dim] + y = layer_norm(residual + dropout(bias + x)); + )DOC"); + } +}; + +class FusedBiasDropoutResidualLnGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean", + "FusedBiasDropoutResidualLnGrad"); + OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance", + "FusedBiasDropoutResidualLnGrad"); + OP_INOUT_CHECK(ctx->HasInput("BiasDropoutResidualOut"), "Input", + "BiasDropoutResidualOut", "FusedBiasDropoutResidualLnGrad"); + if (ctx->HasOutput(framework::GradVarName("LnScale"))) { + ctx->SetOutputDim(framework::GradVarName("LnScale"), + ctx->GetInputDim("LnScale")); + } + if (ctx->HasOutput(framework::GradVarName("LnBias"))) { + ctx->SetOutputDim(framework::GradVarName("LnBias"), + ctx->GetInputDim("LnBias")); + } + if (ctx->HasOutput(framework::GradVarName("Residual"))) { + ctx->SetOutputDim(framework::GradVarName("Residual"), + ctx->GetInputDim("Residual")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + ctx->SetOutputDim(framework::GradVarName("BiasDropoutResidualOut"), + ctx->GetInputDim("BiasDropoutResidualOut")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = framework::TransToProtoVarType(input->dtype()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +template +class FusedBiasDropoutResidualLnGradOpMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_bias_dropout_residual_layer_norm_grad"); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + op->SetInput("X", this->Input("X")); + 
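+    // The forward inputs (Residual, Bias, LnScale, LnBias) and the saved
+    // intermediate outputs (LnMean, LnVariance, BiasDropoutResidualOut,
+    // DropoutMaskOut) are wired into the grad op below so that the grad
+    // kernel can reuse them instead of recomputing the forward pass.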
op->SetInput("Residual", this->Input("Residual")); + if (this->HasInput("Bias")) { + op->SetInput("Bias", this->Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); + } + if (this->HasInput("LnScale")) { + op->SetInput("LnScale", this->Input("LnScale")); + op->SetOutput(framework::GradVarName("LnScale"), + this->InputGrad("LnScale")); + } + if (this->HasInput("LnBias")) { + op->SetInput("LnBias", this->Input("LnBias")); + op->SetOutput(framework::GradVarName("LnBias"), + this->InputGrad("LnBias")); + } + if (this->HasOutput("LnMean")) { + op->SetInput("LnMean", this->Output("LnMean")); + } + if (this->HasOutput("LnVariance")) { + op->SetInput("LnVariance", this->Output("LnVariance")); + } + if (this->HasOutput("BiasDropoutResidualOut")) { + op->SetInput("BiasDropoutResidualOut", + this->Output("BiasDropoutResidualOut")); + } + op->SetInput("DropoutMaskOut", this->Output("DropoutMaskOut")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Residual"), + this->InputGrad("Residual")); + op->SetOutput(framework::GradVarName("BiasDropoutResidualOut"), + this->OutputGrad("BiasDropoutResidualOut")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_bias_dropout_residual_layer_norm, ops::FusedBiasDropoutResidualLnOp, + ops::FusedBiasDropoutResidualLnOpMaker, + ops::FusedBiasDropoutResidualLnGradOpMaker, + ops::FusedBiasDropoutResidualLnGradOpMaker); +REGISTER_OPERATOR(fused_bias_dropout_residual_layer_norm_grad, + ops::FusedBiasDropoutResidualLnGradOp); diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu new file mode 100644 index 0000000000000..71a2c9728cc6b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -0,0 +1,148 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + auto *input_x = ctx.Input("X"); + auto *bias = ctx.Input("Bias"); + auto *residual = ctx.Input("Residual"); + const float ln_epsilon = ctx.Attr("ln_epsilon"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_bias = ctx.Input("LnBias"); + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Output("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Output("LnMean"); + auto *ln_var = ctx.Output("LnVariance"); + auto *y = ctx.Output("Y"); + auto *x_data = input_x->data(); + auto *bias_data = (bias == nullptr) ? nullptr : bias->data(); + auto *residual_data = (residual == nullptr) ? nullptr : residual->data(); + auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); + auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data()); + auto *bias_dropout_residual_out_data = + bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + auto *ln_mean_data = ln_mean->mutable_data(ctx.GetPlace()); + auto *ln_var_data = ln_var->mutable_data(ctx.GetPlace()); + auto *dropout_mask_out_data = + dropout_mask_out->mutable_data(ctx.GetPlace()); + auto *y_data = y->mutable_data(ctx.GetPlace()); + + const auto input_x_dims = input_x->dims(); + int bsz_seq = 1; + for (int i = 0; i < input_x_dims.size() - 1; i++) { + bsz_seq *= input_x_dims[i]; + } + int dim_embed = input_x_dims[input_x_dims.size() - 1]; + DropoutParam dropout_param(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param, + ln_epsilon); + // output = layernorm(residual + dropout(input + bias)) + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + ctx.cuda_device_context(), x_data, residual_data, bias_data, + ln_scale_data, ln_bias_data, bias_dropout_residual_out_data, + dropout_mask_out_data, y_data, ln_mean_data, ln_var_data); + } +}; + +template +class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + const float ln_epsilon = ctx.Attr("ln_epsilon"); + + auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *ln_scale = ctx.Input("LnScale"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Input("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Input("LnMean"); + auto *ln_var = ctx.Input("LnVariance"); + auto *d_y_data = d_y->data(); + auto *ln_scale_data = (ln_scale == nullptr ? 
nullptr : ln_scale->data()); + auto *dropout_mask_out_data = dropout_mask_out->data(); + auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data(); + auto *ln_mean_data = ln_mean->data(); + auto *ln_var_data = ln_var->data(); + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_residual = ctx.Output(framework::GradVarName("Residual")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_bias_dropout_residual_out = + ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); + auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); + auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + auto *d_x_data = d_x->mutable_data(ctx.GetPlace()); + auto *d_residual_data = d_residual->mutable_data(ctx.GetPlace()); + auto *d_bias_dropout_residual_out_data = + d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + auto *d_bias_data = + (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + auto *d_ln_scale_data = + (d_ln_scale == nullptr ? nullptr + : d_ln_scale->mutable_data(ctx.GetPlace())); + auto *d_ln_bias_data = + (d_ln_bias == nullptr ? nullptr + : d_ln_bias->mutable_data(ctx.GetPlace())); + + const auto input_x_dims = d_y->dims(); + int bsz_seq = 1; + for (int i = 0; i < input_x_dims.size() - 1; i++) { + bsz_seq *= input_x_dims[i]; + } + int dim_embed = input_x_dims[input_x_dims.size() - 1]; + DropoutParam dropout_param(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param, + ln_epsilon); + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + ctx.cuda_device_context(), d_y_data, bias_dropout_residual_out_data, + dropout_mask_out_data, ln_scale_data, ln_mean_data, ln_var_data, + d_bias_dropout_residual_out_data, d_ln_scale_data, d_ln_bias_data, + d_x_data, d_bias_data, d_residual_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(fused_bias_dropout_residual_layer_norm, + ops::FusedBiasDropoutResidualLnOpKernel, + ops::FusedBiasDropoutResidualLnOpKernel, + ops::FusedBiasDropoutResidualLnOpKernel); +REGISTER_OP_CUDA_KERNEL( + fused_bias_dropout_residual_layer_norm_grad, + ops::FusedBiasDropoutResidualLnGradKernel, + ops::FusedBiasDropoutResidualLnGradKernel, + ops::FusedBiasDropoutResidualLnGradKernel); diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 0a33a60f8123d..c352f08ec2ba7 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -82,7 +82,7 @@ struct DropoutParam { auto& dropout_implementation = context.Attr(pre_fix + "implementation"); is_upscale_in_train = (dropout_implementation == "upscale_in_train"); - is_test = context.Attr(pre_fix + "is_test"); + is_test = context.Attr("is_test"); fix_seed = context.Attr(pre_fix + "fix_seed"); std::string str_seed = "Dropout"; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index f3f8f17427577..8e15232acda90 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -61,14 +61,14 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel { tmp_dim_x[dim_x.size() - 1] = dim_Linear1Weight[dim_Linear1Weight.size() - 1]; context->SetOutputDim("Out", dim_x); - if 
(context->Attrs().Get("dropout1_is_test") == false) { + if (context->Attrs().Get("is_test") == false) { context->SetOutputDim("Dropout1Mask", tmp_dim_x); } context->SetOutputDim("Dropout1Out", tmp_dim_x); context->SetOutputDim("Linear1Out", tmp_dim_x); context->SetOutputDim("Dropout2Out", dim_x); - if (context->Attrs().Get("dropout2_is_test") == false) { + if (context->Attrs().Get("is_test") == false) { context->SetOutputDim("Dropout2Mask", dim_x); } framework::DDim mean_dim = @@ -185,9 +185,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { "dropout2_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); - AddAttr("dropout1_is_test", "the is_test of first dropout") - .SetDefault(false); - AddAttr("dropout2_is_test", "the is_test of second dropout") + AddAttr("is_test", "the is_test attribute of dropout") .SetDefault(false); AddAttr("dropout1_fix_seed", "the is_test of first dropout") .SetDefault(false); @@ -218,10 +216,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout1_is_test"), false, - platform::errors::InvalidArgument( - "GradOp is only callable when is_test is false")); - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout2_is_test"), false, + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, platform::errors::InvalidArgument( "GradOp is only callable when is_test is false")); bool pre_layer_norm = ctx->Attrs().Get("pre_layer_norm"); diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h new file mode 100644 index 0000000000000..cda33987d68ac --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -0,0 +1,647 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +inline std::string MemoryDebugString(const Tensor& t) { + std::stringstream ss; + ss << "shape=[" << t.dims() + << "], size=" << static_cast(t.memory_size()) / (1 << 20) + << " MB, ptr=" << t.data(); + + size_t total = 0; + size_t available = 0; + platform::GpuMemoryUsage(&available, &total); + ss << "; memory allocated=" + << static_cast(total - available) / (1 << 20) << " MB"; + return ss.str(); +} + +template +struct TernaryAddFunctor { + inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } +}; + +template +struct GateAttentionConfig { + public: + int64_t batch_size; + int64_t seq_len_m; + int64_t seq_len_r; + int64_t q_dim; + int64_t kv_dim; + int64_t key_dim; + int64_t m_size; + int64_t num_heads; + + phi::DDim qkv_out_dims; + phi::DDim qkv_transpose_out_dims; + + phi::DDim q_out_dims; + phi::DDim kv_out_dims; + phi::DDim q_transpose_out_dims; + phi::DDim kv_transpose_out_dims; + + phi::DDim qk_out_dims; + phi::DDim softmax_out_dims; + phi::DDim qktv_out_dims; + phi::DDim gate_out_dims; + + GateAttentionConfig(const Tensor* query, const Tensor* key, + const Tensor* query_weight, const Tensor* qkv_weight, + bool merge_qkv) { + // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] + batch_size = query->dims()[0]; + seq_len_m = query->dims()[1]; + seq_len_r = query->dims()[2]; + q_dim = query->dims()[3]; + + if (merge_qkv) { + PADDLE_ENFORCE_NOT_NULL( + qkv_weight, + platform::errors::NotFound("The input qkv_weight can not be nullptr " + "when merge_qkv is true.")); + + // When q_dim == kv_dim, the QKV matmul can be computed as a single merged matmul. + // qkv_weight: shape=[3, num_heads, key_dim, q_dim] + num_heads = qkv_weight->dims()[1]; + key_dim = qkv_weight->dims()[2]; + m_size = seq_len_r; + kv_dim = q_dim; + + qkv_out_dims = {batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim}; + qkv_transpose_out_dims = {3, batch_size, seq_len_m, + num_heads, seq_len_r, key_dim}; + } else { + PADDLE_ENFORCE_NOT_NULL( + key, + platform::errors::NotFound( + "The input key can not be nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + query_weight, + platform::errors::NotFound("The input query_weight can not be " + "nullptr when merge_qkv is false.")); + + // When q_dim != kv_dim, the QKV matmul must be computed separately.
+ // key: shape=[batch_size, seq_len_m, m_size, kv_dim] + // query_w: shape=[q_dim, num_heads, key_dim] + num_heads = query_weight->dims()[1]; + key_dim = query_weight->dims()[2]; + m_size = key->dims()[2]; + kv_dim = key->dims()[3]; + + q_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim}; + kv_out_dims = {batch_size, seq_len_m, m_size, num_heads, key_dim}; + q_transpose_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, + key_dim}; + kv_transpose_out_dims = {batch_size, seq_len_m, num_heads, m_size, + key_dim}; + } + + qk_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size}; + softmax_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size}; + qktv_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, key_dim}; + gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim}; + } + + int64_t GetQuerySize() const { + return batch_size * seq_len_m * seq_len_r * num_heads * key_dim; + } + + Tensor* GetQKVOut(const platform::CUDADeviceContext& dev_ctx) { + if (!qkv_out.IsInitialized()) { + qkv_out.Resize(qkv_out_dims); + qkv_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qkv_out: " << MemoryDebugString(qkv_out); + } + return &qkv_out; + } + + Tensor* GetQueryOut(const platform::CUDADeviceContext& dev_ctx) { + if (!query_out.IsInitialized()) { + query_out.Resize(q_out_dims); + query_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "query_out: " << MemoryDebugString(query_out); + } + return &query_out; + } + + Tensor* GetKeyOut(const platform::CUDADeviceContext& dev_ctx) { + if (!key_out.IsInitialized()) { + key_out.Resize(kv_out_dims); + key_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "key_out: " << MemoryDebugString(key_out); + } + return &key_out; + } + + Tensor* GetValueOut(const platform::CUDADeviceContext& dev_ctx) { + if (!value_out.IsInitialized()) { + value_out.Resize(kv_out_dims); + value_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "value_out: " << MemoryDebugString(value_out); + } + return &value_out; + } + + Tensor* GetQKOut(const platform::CUDADeviceContext& dev_ctx, + Tensor* softmax_out) { + // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] + int softmax_dim = m_size; + if (!softmax_out || phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { + // Not sure whether cudnn softmax can execute inplace. + if (!qkv_out.IsInitialized()) { + qk_out.Resize(qk_out_dims); + qk_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qk_out: " << MemoryDebugString(qk_out); + } + return &qk_out; + } else { + return softmax_out; + } + } + + void ClearQKVOut() { + if (qkv_out.IsInitialized()) { + qkv_out.clear(); + } + } + + void ClearQKOut() { + if (qk_out.IsInitialized()) { + qk_out.clear(); + } + } + + protected: + Tensor qkv_out; + // QKV is not merged + Tensor query_out; + Tensor key_out; + Tensor value_out; + // qk_out = BatchedGEMM(Q, K^T) + // qk_out: shape=[batch_size, seq_len_m, num_heads, seq_len_r, m_size] + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + // The shape of qk_out, softmax_out is the same, thus can be called inplace. 
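+  // All of these workspace members (including qk_out below) are allocated
+  // lazily by the Get*Out() helpers and can be released early through
+  // ClearQKVOut()/ClearQKOut() to reduce peak GPU memory usage.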
+ Tensor qk_out; +}; + +template +struct GateAttentionGradConfig : public GateAttentionConfig { + public: + GateAttentionGradConfig(const Tensor* query, const Tensor* key, + const Tensor* query_weight, const Tensor* qkv_weight, + bool merge_qkv) + : GateAttentionConfig(query, key, query_weight, qkv_weight, + merge_qkv) {} + + Tensor* GetQKVOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!qkv_out_grad.IsInitialized()) { + qkv_out_grad.Resize(this->qkv_out_dims); + qkv_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qkv_out_grad: " << MemoryDebugString(qkv_out_grad); + } + return &qkv_out_grad; + } + + Tensor* GetQueryOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!query_out_grad.IsInitialized()) { + query_out_grad.Resize(this->q_out_dims); + query_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "query_out_grad: " << MemoryDebugString(query_out_grad); + } + return &query_out_grad; + } + + Tensor* GetKeyOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!key_out_grad.IsInitialized()) { + key_out_grad.Resize(this->kv_out_dims); + key_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "key_out_grad: " << MemoryDebugString(key_out_grad); + } + return &key_out_grad; + } + + Tensor* GetValueOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!value_out_grad.IsInitialized()) { + value_out_grad.Resize(this->kv_out_dims); + value_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "value_out_grad: " << MemoryDebugString(value_out_grad); + } + return &value_out_grad; + } + + Tensor* GetQKOutGrad(const platform::CUDADeviceContext& dev_ctx, + Tensor* softmax_out_grad) { + // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] + int softmax_dim = this->m_size; + if (!softmax_out_grad || + phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { + if (!qk_out_grad.IsInitialized()) { + qk_out_grad.Resize(this->qk_out_dims); + qk_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qk_out_grad: " << MemoryDebugString(qk_out_grad); + } + return &qk_out_grad; + } else { + return softmax_out_grad; + } + } + + protected: + Tensor qkv_out_grad; + Tensor query_out_grad; + Tensor key_out_grad; + Tensor value_out_grad; + Tensor qk_out_grad; +}; + +template +class FMHAGateRef { + public: + FMHAGateRef(const platform::CUDADeviceContext& dev_ctx, bool merge_qkv) + : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} + + void ComputeForward(const Tensor* nonbatched_bias, const Tensor* src_mask, + Tensor* q_transpose_out, Tensor* k_transpose_out, + Tensor* v_transpose_out, Tensor* qkv_transpose_out, + Tensor* softmax_out, Tensor* fmha_out, + GateAttentionConfig* config) { + T* q_ptr = nullptr; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + if (merge_qkv_) { + // qkv_transpose_out = transpose(qkv_out) + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + Tensor* qkv_out = config->GetQKVOut(dev_ctx_); + ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); + config->ClearQKVOut(); + + // q_size == k_size + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); + 
PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + Tensor* query_out = config->GetQueryOut(dev_ctx_); + Tensor* key_out = config->GetKeyOut(dev_ctx_); + Tensor* value_out = config->GetValueOut(dev_ctx_); + ComputeQKVTransposeForward(*query_out, *key_out, *value_out, + q_transpose_out, k_transpose_out, + v_transpose_out); + + // q_size != k_size + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + } + + // qk_out = BatchedGEMM(Q, K^T) + // [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] * + // [batch_size, seq_len_m, num_heads, m_size, key_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] + Tensor* qk_out = config->GetQKOut(dev_ctx_, softmax_out); + T* qk_out_ptr = qk_out->data(); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + int64_t gemm_m = config->seq_len_r; + int64_t gemm_n = config->m_size; + int64_t gemm_k = config->key_dim; + + T alpha = static_cast(1.0 / sqrt(config->key_dim)); + ComputeBatchedGEMM(q_ptr, k_ptr, qk_out_ptr, false, true, gemm_m, gemm_n, + gemm_k, gemm_batch_size, alpha); + + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + ComputeBiasMaskSoftmaxForward(nonbatched_bias, src_mask, qk_out, + softmax_out); + config->ClearQKOut(); + + // qktv_out = BatchedGEMM(softmax_out, V) + // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * + // [batch_size, seq_len_m, num_heads, m_size, key_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] + Tensor qktv_out; + qktv_out.Resize(config->qktv_out_dims); + T* qktv_out_ptr = qktv_out.mutable_data(dev_ctx_.GetPlace()); + + gemm_m = config->seq_len_r; + gemm_n = config->key_dim; + gemm_k = config->m_size; + + T* softmax_out_ptr = softmax_out->data(); + ComputeBatchedGEMM(softmax_out_ptr, v_ptr, qktv_out_ptr, false, false, + gemm_m, gemm_n, gemm_k, gemm_batch_size); + + // fmha_out = transpose(qktv_out) + ComputeQKTVTransposeForward(qktv_out, fmha_out); + } + + void ComputeBackward(const Tensor* q_transpose_out, + const Tensor* k_transpose_out, + const Tensor* v_transpose_out, + const Tensor* qkv_transpose_out, + const Tensor* softmax_out, const Tensor* fmha_out_grad, + Tensor* src_mask_grad, Tensor* nonbatched_bias_grad, + GateAttentionGradConfig* config) { + const T* q_ptr = nullptr; + const T* k_ptr = nullptr; + const T* v_ptr = nullptr; + + T* q_grad_ptr = nullptr; + T* k_grad_ptr = nullptr; + T* v_grad_ptr = nullptr; + + Tensor q_transpose_out_grad; + Tensor k_transpose_out_grad; + Tensor v_transpose_out_grad; + Tensor qkv_transpose_out_grad; + if (merge_qkv_) { + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + + qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); + + q_grad_ptr = qkv_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + k_grad_ptr = q_grad_ptr + q_size; + v_grad_ptr = k_grad_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is 
false.")); + PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + + q_transpose_out_grad.Resize(config->q_transpose_out_dims); + k_transpose_out_grad.Resize(config->kv_transpose_out_dims); + v_transpose_out_grad.Resize(config->kv_transpose_out_dims); + + q_grad_ptr = q_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + k_grad_ptr = k_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + v_grad_ptr = v_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + } + + Tensor softmax_out_grad; + softmax_out_grad.Resize(config->softmax_out_dims); + softmax_out_grad.mutable_data(dev_ctx_.GetPlace()); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + { + // Forward: fmha_out = transpose(qktv_out) + Tensor qktv_out_grad; + qktv_out_grad.Resize(config->qktv_out_dims); + T* qktv_out_grad_ptr = qktv_out_grad.mutable_data(dev_ctx_.GetPlace()); + ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); + + // Forward: qktv_out = BatchedGEMM(softmax_out, V) + // Backward: + // V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->key_dim; + int64_t gemm_k = config->seq_len_r; + + const T* softmax_out_ptr = softmax_out->data(); + ComputeBatchedGEMM(softmax_out_ptr, qktv_out_grad_ptr, v_grad_ptr, true, + false, gemm_m, gemm_n, gemm_k, gemm_batch_size); + + // Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T) + gemm_m = config->seq_len_r; + gemm_n = config->m_size; + gemm_k = config->key_dim; + + T* softmax_out_grad_ptr = softmax_out_grad.data(); + ComputeBatchedGEMM(qktv_out_grad_ptr, v_ptr, softmax_out_grad_ptr, false, + true, gemm_m, gemm_n, gemm_k, gemm_batch_size); + } + + Tensor* qk_out_grad = config->GetQKOutGrad(dev_ctx_, &softmax_out_grad); + ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, softmax_out, + src_mask_grad, qk_out_grad, + nonbatched_bias_grad); + + // Forward: qk_out = BatchedGEMM(Q, K^T) + // Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->key_dim; + int64_t gemm_k = config->seq_len_r; + T alpha = static_cast(1.0 / sqrt(config->key_dim)); + + T* qk_out_grad_ptr = qk_out_grad->data(); + ComputeBatchedGEMM(qk_out_grad_ptr, q_ptr, k_grad_ptr, true, false, gemm_m, + gemm_n, gemm_k, gemm_batch_size, alpha); + + // Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y) + gemm_m = config->seq_len_r; + gemm_n = config->key_dim; + gemm_k = config->m_size; + ComputeBatchedGEMM(qk_out_grad_ptr, k_ptr, q_grad_ptr, false, false, gemm_m, + gemm_n, gemm_k, gemm_batch_size, alpha); + + if (merge_qkv_) { + Tensor* qkv_out_grad = config->GetQKVOutGrad(dev_ctx_); + ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); + } else { + Tensor* q_out_grad = config->GetQueryOutGrad(dev_ctx_); + Tensor* k_out_grad = config->GetKeyOutGrad(dev_ctx_); + Tensor* v_out_grad = config->GetValueOutGrad(dev_ctx_); + ComputeQKVTransposeBackward(q_transpose_out_grad, k_transpose_out_grad, + v_transpose_out_grad, q_out_grad, k_out_grad, + v_out_grad); + } + } + + void ComputeQKVTransposeForward(const Tensor& q_out, const Tensor& k_out, + const Tensor& v_out, Tensor* q_transpose_out, + Tensor* k_transpose_out, + Tensor* v_transpose_out) { + int ndims = 5; + std::vector 
perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, q_out, perm, q_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, ndims, k_out, perm, k_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, ndims, v_out, perm, v_transpose_out); + } + + void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad, + const Tensor& k_transpose_out_grad, + const Tensor& v_transpose_out_grad, + Tensor* q_out_grad, Tensor* k_out_grad, + Tensor* v_out_grad) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, q_transpose_out_grad, perm, + q_out_grad); + TransposeGPUKernelDriver(dev_ctx_, ndims, k_transpose_out_grad, perm, + k_out_grad); + TransposeGPUKernelDriver(dev_ctx_, ndims, v_transpose_out_grad, perm, + v_out_grad); + } + + // [batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim] -> + // [3, batch_size, seq_len_m, num_heads, seq_len_r, key_dim] + void ComputeQKVTransposeForward(const Tensor& qkv_out, + Tensor* qkv_transpose_out) { + int ndims = 6; + std::vector perm = {3, 0, 1, 4, 2, 5}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_out, perm, + qkv_transpose_out); + } + + void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad, + Tensor* qkv_out_grad) { + int ndims = 6; + std::vector perm = {1, 2, 4, 0, 3, 5}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_transpose_out_grad, perm, + qkv_out_grad); + } + + // [batch_size, seq_len_m, num_head, seq_len_r, c] -> + // [batch_size, seq_len_m, seq_len_r, num_head, c] + void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qktv_out, perm, fmha_out); + } + + void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad, + Tensor* qktv_out_grad) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, fmha_out_grad, perm, + qktv_out_grad); + } + + // qk_out = qk_out + nonbatched_bias + src_mask + // softmax_out = softmax(src_mask_out) + void ComputeBiasMaskSoftmaxForward(const Tensor* nonbatched_bias, + const Tensor* src_mask, Tensor* qk_out, + Tensor* softmax_out) { + if (nonbatched_bias) { + std::vector ins = {qk_out, nonbatched_bias, src_mask}; + std::vector outs = {qk_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, TernaryAddFunctor()); + } else { + std::vector ins = {qk_out, src_mask}; + std::vector outs = {qk_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); + } + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out, -1, softmax_out); + } + + // src_mask_out = qk_out + nonbatched_bias + src_mask + // softmax_out = softmax(src_mask_out) + void ComputeBiasMaskSoftmaxBackward(const Tensor* softmax_out_grad, + const Tensor* softmax_out, + Tensor* src_mask_grad, + Tensor* qk_out_grad, + Tensor* nonbatched_bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + qk_out_grad, + platform::errors::NotFound("The qk_out_grad can not be nullptr.")); + + PADDLE_ENFORCE_EQ(qk_out_grad->dims(), softmax_out->dims(), + platform::errors::InvalidArgument( + "The shape of qk_out_grad and softmax_out is " + "expected to be the same. 
But received qk_out_grad's " + "shape = %s, softmax_out's shape = %s.", + qk_out_grad->dims(), softmax_out->dims())); + + PADDLE_ENFORCE_EQ(src_mask_grad, nullptr, + platform::errors::InvalidArgument( + "src_mask_grad is expected to be nullptr.")); + + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx_, *softmax_out, + *softmax_out_grad, -1, qk_out_grad); + + // [1, bs, num_head, seq_l, seq_l] -> [bs, num_head, seq_l, seq_l] + if (nonbatched_bias_grad) { + gpuStream_t stream = dev_ctx_.stream(); + TensorReduceImpl>( + dev_ctx_, *qk_out_grad, nonbatched_bias_grad, + kps::IdentityFunctor(), {0, 1}, stream); + } + } + + private: + void ComputeBatchedGEMM(const T* a_ptr, const T* b_ptr, T* c_ptr, + bool trans_a, bool trans_b, int64_t m, int64_t n, + int64_t k, int64_t batch_size, + T alpha = static_cast(1.0), + T beta = static_cast(0.0)) { + CBLAS_TRANSPOSE cblas_trans_a = trans_a ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE cblas_trans_b = trans_b ? CblasTrans : CblasNoTrans; + int64_t stride_a = m * k; + int64_t stride_b = k * n; + + auto blas = phi::funcs::GetBlas(dev_ctx_); + blas.BatchedGEMM(cblas_trans_a, cblas_trans_b, m, n, k, alpha, a_ptr, b_ptr, + beta, c_ptr, batch_size, stride_a, stride_b); + } + + const platform::CUDADeviceContext& dev_ctx_; + bool merge_qkv_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc new file mode 100644 index 0000000000000..ba9dbd82e3dcc --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -0,0 +1,317 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +class FusedGateAttentionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input", "OutLinearWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias", + "fused_gate_attention"); + + OP_INOUT_CHECK(ctx->HasOutput("SoftmaxOut"), "Output", "SoftmaxOut", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("FMHAOut"), "Output", "FMHAOut", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "fused_gate_attention"); + + auto input_q_dims = ctx->GetInputDim("Query"); + int batch_size = input_q_dims[0]; + int seq_len_m = input_q_dims[1]; + int seq_len_r = input_q_dims[2]; + + int num_head, m_size, key_dim; + if (ctx->Attrs().Get("merge_qkv")) { + // QKV's input: [batch_size, seq_len_m, seq_len_r, qkv_dim] + // QKV's weight: [3, num_head, key_dim, qkv_dim] + OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("QKVTransposeOut"), "Output", + "QKVTransposeOut", "fused_gate_attention"); + + auto qkv_w_dims = ctx->GetInputDim("QKVWeight"); + + num_head = qkv_w_dims[1]; + key_dim = qkv_w_dims[2]; + m_size = seq_len_r; + + ctx->SetOutputDim("QKVTransposeOut", {3, batch_size, seq_len_m, num_head, + seq_len_r, key_dim}); + } else { + OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight", + "fused_gate_attention"); + + auto input_k_dims = ctx->GetInputDim("Key"); + auto q_w_dims = ctx->GetInputDim("QueryWeight"); + + num_head = q_w_dims[1]; + key_dim = q_w_dims[2]; + m_size = input_k_dims[2]; + + ctx->SetOutputDim("QueryTransposeOut", + {batch_size, seq_len_m, num_head, seq_len_r, key_dim}); + ctx->SetOutputDim("KeyTransposeOut", + {batch_size, seq_len_m, num_head, m_size, key_dim}); + ctx->SetOutputDim("ValueTransposeOut", + {batch_size, seq_len_m, num_head, m_size, key_dim}); + } + + ctx->SetOutputDim("SoftmaxOut", + {batch_size, seq_len_m, num_head, seq_len_r, m_size}); + ctx->SetOutputDim("FMHAOut", + {batch_size, seq_len_m, seq_len_r, num_head, key_dim}); + + if (ctx->Attrs().Get("has_gating")) { + OP_INOUT_CHECK(ctx->HasInput("GateWeight"), "Input", "GateWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("GateBias"), "Input", "GateBias", + "fused_gate_attention"); + ctx->SetOutputDim("GateOut", + {batch_size, seq_len_m, seq_len_r, num_head, key_dim}); + } + + ctx->SetOutputDim("Out", ctx->GetInputDim("Query")); + } +}; + +class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Query", "The query tensor."); + AddInput("Key", "The key tensor.").AsDispensable(); + AddInput("QueryWeight", "(optional) The query weight tensor.") + .AsDispensable(); + AddInput("KeyWeight", "(optional) The key weight tensor.").AsDispensable(); + AddInput("ValueWeight", "(optional) The value weight tensor.") + .AsDispensable(); 
+ AddInput("QKVWeight", "(optional) The qkv weight tensor.").AsDispensable(); + AddInput("NonbatchedBias", "(optional) The nonbatchedBias tensor.") + .AsDispensable(); + AddInput("SrcMask", "The attention mask tensor in fmha."); + AddInput("GateWeight", "(optional) The gate weight tensor.") + .AsDispensable(); + AddInput("GateBias", "(optional) The gate bias tensor.").AsDispensable(); + AddInput("OutLinearWeight", "The out_linear weight tensor."); + AddInput("OutLinearBias", "The out_linear bias tensor."); + AddOutput("QueryTransposeOut", "The transposed result of query matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("KeyTransposeOut", "The transposed result of key matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("ValueTransposeOut", "The transposed result of value matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("QKVTransposeOut", "The transposed result of merged QKV matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("SoftmaxOut", "Result in fmha.").AsIntermediate(); + AddOutput("FMHAOut", "Result in fmha.").AsIntermediate(); + AddOutput("GateOut", "Result of the gating module.") + .AsIntermediate() + .AsDispensable(); + AddOutput("Out", "Result after attention."); + AddAttr("has_gating", + "if true, the attention op uses gate architecure, " + "[default true].") + .SetDefault(true); + AddAttr("merge_qkv", + "if true, calculation with merged qkv, " + "[default true].") + .SetDefault(true); + AddComment(R"DOC( + Add fused attention op whose logic is as follows: + { + q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) + k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w) + v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w) + + logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q * c , k) + bias + weights = nn.functional.softmax(logits) + weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) + if nonbatched_bias is not None: + logits += paddle.unsqueeze(nonbatched_bias, axis=1) + + if self.gating: + gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data, + self.gating_w) + self.gating_b + gate_values_1 = nn.functional.sigmoid(gate_values) + weighted_avg *= gate_values_1 + + output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, + self.output_w) + self.output_b + + } + )DOC"); + } +}; + +class FusedGateAttentionGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query", + "fused_gate_attention_grad"); + if (ctx->HasOutput(framework::GradVarName("Query"))) { + ctx->SetOutputDim(framework::GradVarName("Query"), + ctx->GetInputDim("Query")); + } + if (ctx->HasOutput(framework::GradVarName("Key"))) { + ctx->SetOutputDim(framework::GradVarName("Key"), ctx->GetInputDim("Key")); + } + + if (ctx->Attrs().Get("merge_qkv")) { + OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight", + "fused_gate_attention_arad"); + ctx->SetOutputDim(framework::GradVarName("QKVWeight"), + ctx->GetInputDim("QKVWeight")); + } else { + OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight", + "fused_aate_attention_arad"); + OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight", + "fused_aate_attention_arad"); + OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight", + "fused_aate_attention_arad"); + + for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) { + ctx->SetOutputDim(framework::GradVarName(name), 
ctx->GetInputDim(name)); + } + } + + OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input", "OutLinearWeight", + "fused_aate_attention_arad"); + + if (ctx->Attrs().Get("has_gating")) { + for (auto& name : {"GateWeight", "GateBias", "GateOut"}) { + ctx->SetOutputDim(framework::GradVarName(name), ctx->GetInputDim(name)); + } + } + + if (ctx->HasOutput(framework::GradVarName("NonbatchedBias"))) { + ctx->SetOutputDim(framework::GradVarName("NonbatchedBias"), + ctx->GetInputDim("NonbatchedBias")); + } + + ctx->SetOutputDim(framework::GradVarName("FMHAOut"), + ctx->GetInputDim("FMHAOut")); + + ctx->SetOutputDim(framework::GradVarName("OutLinearWeight"), + ctx->GetInputDim("OutLinearWeight")); + ctx->SetOutputDim(framework::GradVarName("OutLinearBias"), + ctx->GetInputDim("OutLinearBias")); + } +}; + +template +class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_gate_attention_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + op->SetInput("Query", this->Input("Query")); + op->SetOutput(framework::GradVarName("Query"), this->InputGrad("Query")); + + op->SetAttrMap(this->Attrs()); + bool merge_qkv = BOOST_GET_CONST(bool, op->GetAttr("merge_qkv")); + if (merge_qkv) { + op->SetInput("QKVWeight", this->Input("QKVWeight")); + op->SetOutput(framework::GradVarName("QKVWeight"), + this->InputGrad("QKVWeight")); + op->SetInput("QKVTransposeOut", this->Output("QKVTransposeOut")); + } else { + op->SetInput("Key", this->Input("Key")); + op->SetOutput(framework::GradVarName("Key"), this->InputGrad("Key")); + + for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) { + op->SetInput(name, this->Input(name)); + op->SetOutput(framework::GradVarName(name), this->InputGrad(name)); + } + + for (auto& name : + {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut"}) { + op->SetInput(name, this->Output(name)); + } + } + + op->SetInput("FMHAOut", this->Output("FMHAOut")); + op->SetOutput(framework::GradVarName("FMHAOut"), + this->OutputGrad("FMHAOut")); + + if (this->HasInput("NonbatchedBias")) { + op->SetInput("NonbatchedBias", this->Input("NonbatchedBias")); + op->SetOutput(framework::GradVarName("NonbatchedBias"), + this->InputGrad("NonbatchedBias")); + } + + op->SetInput("SoftmaxOut", this->Output("SoftmaxOut")); + + bool has_gating = BOOST_GET_CONST(bool, op->GetAttr("has_gating")); + if (has_gating) { + op->SetInput("GateWeight", this->Input("GateWeight")); + op->SetOutput(framework::GradVarName("GateWeight"), + this->InputGrad("GateWeight")); + + op->SetInput("GateBias", this->Input("GateBias")); + op->SetOutput(framework::GradVarName("GateBias"), + this->InputGrad("GateBias")); + + op->SetInput("GateOut", this->Output("GateOut")); + op->SetOutput(framework::GradVarName("GateOut"), + this->OutputGrad("GateOut")); + } + + op->SetInput("OutLinearWeight", this->Input("OutLinearWeight")); + op->SetOutput(framework::GradVarName("OutLinearWeight"), + this->InputGrad("OutLinearWeight")); + + op->SetInput("OutLinearBias", this->Input("OutLinearBias")); + op->SetOutput(framework::GradVarName("OutLinearBias"), + this->InputGrad("OutLinearBias")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_gate_attention, ops::FusedGateAttentionOp, + ops::FusedGateAttentionOpMaker, + ops::FusedGateAttentionGradOpMaker, + 
ops::FusedGateAttentionGradOpMaker); +REGISTER_OPERATOR(fused_gate_attention_grad, ops::FusedGateAttentionGradOp); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu new file mode 100644 index 0000000000000..b1badf72557ae --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -0,0 +1,488 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fused_gate_attention.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct SigmoidMultiplyFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // out = sigmoid(x) * y + inline HOSTDEVICE T operator()(T x, T y) const { + MPType x_mp = static_cast(x); + T sigmoid_out = static_cast(one / (one + exp(-x_mp))); + return sigmoid_out * y; + } +}; + +template +struct SigmoidMultiplyGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // Gradient of Multiply: + // dx = dout * y + // dy = dout * x + // Gradient of Sigmoid: dx = dout * out * (1 - out) + inline HOSTDEVICE phi::Array operator()(const T dout, const T x, + T y) const { + MPType x_mp = static_cast(x); + T sigmoid_out = static_cast(one / (one + exp(-x_mp))); + T d_sigmoid_out = dout * y; + phi::Array outs; + outs[0] = d_sigmoid_out * sigmoid_out * + (static_cast(1.0f) - sigmoid_out); // dx + outs[1] = dout * sigmoid_out; // dy + return outs; + } +}; + +template +void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, Tensor *qkv_out) { + // query: shape=[batch_size, seq_len_m, seq_len_r, qkv_dim] + // qkv_weight: shape=[3, num_heads, key_dim, qkv_dim] + // qkv_out: shape=[batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim] + auto *qkv_weight = ctx.Input("QKVWeight"); + + // qkv_out = GEMM(query, qkv_weight^T) + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = 3 * config.num_heads * config.key_dim; + int k = config.q_dim; + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), false, true, m, n, k, false); + qkv_compute.ComputeForward(qkv_weight, query, nullptr, qkv_out, nullptr); +} + +template +Tensor *ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *query, + const Tensor *qkv_out_grad, + Tensor *query_grad, bool use_addto) { + auto *qkv_weight = ctx.Input("QKVWeight"); + auto *qkv_weight_grad = + ctx.Output(framework::GradVarName("QKVWeight")); + 
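+ // Gradient of qkv_out = query * qkv_weight^T, in the same row-major view as
+ // the forward pass (query flattened to [m, k], qkv_weight to [n, k]):
+ //   d_query      = d_qkv_out * qkv_weight    // [m, n] x [n, k] -> [m, k]
+ //   d_qkv_weight = d_qkv_out^T * query       // [n, m] x [m, k] -> [n, k]
+ // When use_addto is true, d_query is accumulated into query_grad rather than
+ // overwriting it, because the gating branch may already have written a
+ // partial gradient for query.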
qkv_weight_grad->mutable_data(ctx.GetPlace()); + + // Gradient of GEMM(query, qkv_weight) + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = 3 * config.num_heads * config.key_dim; + int k = config.q_dim; + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), false, true, m, n, k, false); + qkv_compute.ComputeBackward(query, qkv_weight, qkv_out_grad, query_grad, + qkv_weight_grad, nullptr, use_addto); + return query_grad; +} + +template +void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, const Tensor *key, + Tensor *query_out, Tensor *key_out, + Tensor *value_out) { + auto *query_weight = ctx.Input("QueryWeight"); + auto *key_weight = ctx.Input("KeyWeight"); + auto *value_weight = ctx.Input("ValueWeight"); + + // query_out = GEMM(query, query_weight) + // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] + // query_weight: shape=[q_dim, num_heads, key_dim] + // query_out: shape=[batch_size, seq_len_m, seq_len_r, num_heads, key_dim] + int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; + int q_n = config.num_heads * config.key_dim; + int q_k = config.q_dim; + auto q_compute = AttnMatMul(ctx.cuda_device_context(), false, false, q_m, + q_n, q_k, false); + q_compute.ComputeForward(query_weight, query, nullptr, query_out, nullptr); + + // k_out = GEMM(key, key_weight) + // key: shape=[batch_size, seq_len_m, m_size, kv_dim] + // key_weight: shape=[kv_dim, num_heads, key_dim] + // key_out: shape=[batch_size, seq_len_m, m_size, num_heads, key_dim] + int kv_m = config.batch_size * config.seq_len_m * config.m_size; + int kv_n = config.num_heads * config.key_dim; + int kv_k = config.kv_dim; + auto kv_compute = AttnMatMul(ctx.cuda_device_context(), false, false, kv_m, + kv_n, kv_k, false); + kv_compute.ComputeForward(key_weight, key, nullptr, key_out, nullptr); + + // value_out = GEMM(value, value_weight) + kv_compute.ComputeForward(value_weight, key, nullptr, value_out, nullptr); +} + +template +Tensor *ComputeSeparatedQKVMatmulBackward( + const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, const Tensor *query, + const Tensor *key, const Tensor *query_out_grad, const Tensor *key_out_grad, + const Tensor *value_out_grad, Tensor *query_grad, Tensor *key_grad, + bool use_addto) { + // Gradient of GEMM(key, k_weight) + const auto *key_weight = ctx.Input("KeyWeight"); + auto *key_weight_grad = + ctx.Output(framework::GradVarName("KeyWeight")); + key_weight_grad->mutable_data(ctx.GetPlace()); + + int kv_m = config.batch_size * config.seq_len_m * config.m_size; + int kv_n = config.num_heads * config.key_dim; + int kv_k = config.kv_dim; + auto kv_compute = AttnMatMul(ctx.cuda_device_context(), false, false, kv_m, + kv_n, kv_k, false); + kv_compute.ComputeBackward(key, key_weight, key_out_grad, key_grad, + key_weight_grad, nullptr, false); + + // Gradient of GEMM(value, v_weight) + auto *value_weight = ctx.Input("ValueWeight"); + auto *value_weight_grad = + ctx.Output(framework::GradVarName("ValueWeight")); + value_weight_grad->mutable_data(ctx.GetPlace()); + + kv_compute.ComputeBackward(key, value_weight, value_out_grad, key_grad, + value_weight_grad, nullptr, true); + + // Gradient of GEMM(query, query_weight) + const auto *query_weight = ctx.Input("QueryWeight"); + auto *query_weight_grad = + ctx.Output(framework::GradVarName("QueryWeight")); + query_weight_grad->mutable_data(ctx.GetPlace()); + + int q_m = config.batch_size * config.seq_len_m 
* config.seq_len_r; + int q_n = config.num_heads * config.key_dim; + int q_k = config.q_dim; + auto q_compute = AttnMatMul(ctx.cuda_device_context(), false, false, q_m, + q_n, q_k, false); + q_compute.ComputeBackward(query, query_weight, query_out_grad, query_grad, + query_weight_grad, nullptr, use_addto); + return query_grad; +} + +template +Tensor *ComputeGatingLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, + const Tensor *fmha_out) { + auto *gate_weight = ctx.Input("GateWeight"); + auto *gate_bias = ctx.Input("GateBias"); + + auto *gate_out = ctx.Output("GateOut"); + gate_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "[ComputeGatingLinearForward] gate_out: " + << MemoryDebugString(*gate_out); + + // The first gate_bias_out stores the result of the multiplication, + // and the second gate_bias_out stores the result of the multiplication + + // bias. + // gate_out = GEMM(query, gate_weight) + gate_bias + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.num_heads * config.key_dim; + int k = config.q_dim; + auto gate_attn_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + gate_attn_compute.ComputeForward(gate_weight, query, gate_bias, gate_out, + gate_out); + + // gate_out = sigmoid(gate_out) * fmha_out + std::vector ins = {gate_out, fmha_out}; + std::vector outs = {gate_out}; + phi::funcs::ElementwiseKernel(ctx.cuda_device_context(), ins, &outs, + SigmoidMultiplyFunctor()); + return gate_out; +} + +template +Tensor *ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *fmha_out, + const Tensor *gate_out_grad, + Tensor *query_grad, Tensor *fmha_out_grad) { + const auto *query = ctx.Input("Query"); + const auto *gate_weight = ctx.Input("GateWeight"); + const auto *gate_bias = ctx.Input("GateBias"); + + // Re-compute gate_bias_out + Tensor gate_bias_out; + gate_bias_out.Resize(config.gate_out_dims); + gate_bias_out.mutable_data(ctx.GetPlace()); + + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.num_heads * config.key_dim; + int k = config.q_dim; + auto gate_attn_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + gate_attn_compute.ComputeForward(gate_weight, query, gate_bias, + &gate_bias_out, &gate_bias_out); + + // Gradient of sigmoid(gate_bias_out) * fmha_out + // Compute inplace and save gate_bias_out_grad to gate_bias_out. 
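+ // With s = sigmoid(gate_bias_out) and gate_out = s * fmha_out, the
+ // SigmoidMultiplyGradFunctor below yields, by the chain rule,
+ //   d_gate_bias_out = d_gate_out * fmha_out * s * (1 - s)
+ //   d_fmha_out      = d_gate_out * s
+ // d_gate_bias_out is written in place into gate_bias_out, and d_fmha_out
+ // into fmha_out_grad.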
+ std::vector ins = {gate_out_grad, &gate_bias_out, fmha_out}; + std::vector outs = {&gate_bias_out, fmha_out_grad}; + phi::funcs::ElementwiseKernel, 2>( + ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyGradFunctor()); + + // Gradient of GEMM(query, gate_weight) + gate_bias + auto *gate_weight_grad = + ctx.Output(framework::GradVarName("GateWeight")); + auto *gate_bias_grad = ctx.Output(framework::GradVarName("GateBias")); + gate_weight_grad->mutable_data(ctx.GetPlace()); + gate_bias_grad->mutable_data(ctx.GetPlace()); + + gate_attn_compute.ComputeBackward(query, gate_weight, &gate_bias_out, + query_grad, gate_weight_grad, + gate_bias_grad); + return fmha_out_grad; +} + +template +Tensor *ComputeOutputLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *fmha_or_gate_out) { + const auto *out_linear_weight = ctx.Input("OutLinearWeight"); + const auto *out_linear_bias = ctx.Input("OutLinearBias"); + + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + VLOG(4) << "[ComputeOutputLinearForward] out: " << MemoryDebugString(*out); + + // out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.q_dim; + int k = config.num_heads * config.key_dim; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + out_linear_compute.ComputeForward(out_linear_weight, fmha_or_gate_out, + out_linear_bias, out, out); + return out; +} + +template +Tensor *ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + bool has_gating) { + std::string input_name = has_gating ? "GateOut" : "FMHAOut"; + + const auto *out_grad = ctx.Input(framework::GradVarName("Out")); + const auto *out_linear_weight = ctx.Input("OutLinearWeight"); + const auto *input = ctx.Input(input_name); + + auto *out_linear_weight_grad = + ctx.Output(framework::GradVarName("OutLinearWeight")); + auto *out_linear_bias_grad = + ctx.Output(framework::GradVarName("OutLinearBias")); + auto *input_grad = ctx.Output(framework::GradVarName(input_name)); + + out_linear_weight_grad->mutable_data(ctx.GetPlace()); + out_linear_bias_grad->mutable_data(ctx.GetPlace()); + input_grad->mutable_data(ctx.GetPlace()); + + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.q_dim; + int k = config.num_heads * config.key_dim; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + out_linear_compute.ComputeBackward(input, out_linear_weight, out_grad, + input_grad, out_linear_weight_grad, + out_linear_bias_grad); + return input_grad; +} + +template +class FusedGateAttentionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); + + const auto *src_mask = ctx.Input("SrcMask"); + const auto *nonbatched_bias = ctx.Input("NonbatchedBias"); + + auto *q_transpose_out = ctx.Output("QueryTransposeOut"); + auto *k_transpose_out = ctx.Output("KeyTransposeOut"); + auto *v_transpose_out = ctx.Output("ValueTransposeOut"); + auto *qkv_transpose_out = ctx.Output("QKVTransposeOut"); + + auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *fmha_out = ctx.Output("FMHAOut"); + + const bool merge_qkv = 
ctx.Attr("merge_qkv"); + const bool has_gating = ctx.Attr("has_gating"); + + // When seq_len_r = m_size, q_dim = kv_dim, QKV matmul can be merged. + auto &dev_ctx = ctx.template device_context(); + GateAttentionConfig config(query, key, query_weight, qkv_weight, + merge_qkv); + + if (merge_qkv) { + // 1. Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) + Tensor *qkv_out = config.GetQKVOut(dev_ctx); + ComputeMergedQKVMatmulForward(ctx, config, query, qkv_out); + + qkv_transpose_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "qkv_transpose_out:" << MemoryDebugString(*qkv_transpose_out); + } else { + // 1. Separated QKV Matmul + Tensor *query_out = config.GetQueryOut(dev_ctx); + Tensor *key_out = config.GetKeyOut(dev_ctx); + Tensor *value_out = config.GetValueOut(dev_ctx); + ComputeSeparatedQKVMatmulForward(ctx, config, query, key, query_out, + key_out, value_out); + + q_transpose_out->mutable_data(ctx.GetPlace()); + k_transpose_out->mutable_data(ctx.GetPlace()); + v_transpose_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "q_transpose_out: " << MemoryDebugString(*q_transpose_out); + VLOG(4) << "k_transpose_out: " << MemoryDebugString(*k_transpose_out); + VLOG(4) << "v_transpose_out: " << MemoryDebugString(*v_transpose_out); + } + + softmax_out->mutable_data(ctx.GetPlace()); + fmha_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "softmax_out: " << MemoryDebugString(*softmax_out); + VLOG(4) << "fmha_out: " << MemoryDebugString(*fmha_out); + + // 2. FMHA + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); + fmha_compute.ComputeForward( + nonbatched_bias, src_mask, q_transpose_out, k_transpose_out, + v_transpose_out, qkv_transpose_out, softmax_out, fmha_out, &config); + + // 3. Gating Linear + Tensor *fmha_or_gate_out = + !has_gating ? fmha_out : ComputeGatingLinearForward(ctx, config, + query, fmha_out); + + // 4. Output Linear + ComputeOutputLinearForward(ctx, config, fmha_or_gate_out); + } +}; + +template +class FusedGateAttentionGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto has_gating = ctx.Attr("has_gating"); + const auto merge_qkv = ctx.Attr("merge_qkv"); + + // forward input + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); + + // forward output, backward input + const auto *q_transpose_out = ctx.Input("QueryTransposeOut"); + const auto *k_transpose_out = ctx.Input("KeyTransposeOut"); + const auto *v_transpose_out = ctx.Input("ValueTransposeOut"); + const auto *qkv_transpose_out = ctx.Input("QKVTransposeOut"); + const auto *softmax_out = ctx.Input("SoftmaxOut"); + const auto *fmha_out = ctx.Input("FMHAOut"); + + // backward output + auto *query_grad = ctx.Output(framework::GradVarName("Query")); + query_grad->mutable_data(ctx.GetPlace()); + auto *nonbatched_bias_grad = + ctx.Output(framework::GradVarName("NonbatchedBias")); + auto *fmha_out_grad = ctx.Output(framework::GradVarName("FMHAOut")); + + auto &dev_ctx = ctx.template device_context(); + GateAttentionGradConfig config(query, key, query_weight, qkv_weight, + merge_qkv); + + // 1. Gradient of Output Linear + Tensor *fhma_or_gate_out_grad = + ComputeOutputLinearBackward(ctx, config, has_gating); + + // 2. Gradient of Gating Linear + if (has_gating) { + // fhma_or_gate_out_grad is actually gate_out_grad. 
+ fmha_out_grad->mutable_data(ctx.GetPlace()); + ComputeGatingLinearBackward(ctx, config, fmha_out, + fhma_or_gate_out_grad, query_grad, + fmha_out_grad); + } + + // 3. Gradient of FMHA + if (nonbatched_bias_grad) { + nonbatched_bias_grad->mutable_data(ctx.GetPlace()); + } + + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); + fmha_compute.ComputeBackward( + q_transpose_out, k_transpose_out, v_transpose_out, qkv_transpose_out, + softmax_out, fmha_out_grad, nullptr, nonbatched_bias_grad, &config); + + bool use_addto = has_gating ? true : false; + if (merge_qkv) { + // 4. Gradient of Merged QKV Matmul + Tensor *qkv_out_grad = config.GetQKVOutGrad(dev_ctx); + ComputeMergedQKVMatmulBackward(ctx, config, query, qkv_out_grad, + query_grad, use_addto); + } else { + // 4. Gradient of Separated QKV Matmul + auto *key_grad = ctx.Output(framework::GradVarName("Key")); + if (key_grad) { + key_grad->mutable_data(ctx.GetPlace()); + } + Tensor *query_out_grad = config.GetQueryOutGrad(dev_ctx); + Tensor *key_out_grad = config.GetKeyOutGrad(dev_ctx); + Tensor *value_out_grad = config.GetValueOutGrad(dev_ctx); + ComputeSeparatedQKVMatmulBackward( + ctx, config, query, key, query_out_grad, key_out_grad, value_out_grad, + query_grad, key_grad, use_addto); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL(fused_gate_attention, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel); +REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel); +#else +REGISTER_OP_CUDA_KERNEL(fused_gate_attention, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel); +REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel); +#endif diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index 4c4e3661e6d6e..7cb6777e5a79a 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -208,6 +209,9 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + PADDLE_ENFORCE_GE( dout_dims.size(), 2, platform::errors::InvalidArgument( @@ -242,14 +246,14 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); PADDLE_ENFORCE_EQ( - dout_mat_dims[1], y_dims[1], + dout_mat_dims[1], trans_y ? y_dims[0] : y_dims[1], platform::errors::InvalidArgument( "The last dimension of DOut should be equal with Y's last" "dimension. 
But received DOut[-1] = [%d], Y[1] = [%d].", dout_mat_dims[1], y_dims[1])); PADDLE_ENFORCE_EQ( - dout_mat_dims[0], x_mat_dims[0], + dout_mat_dims[0], trans_x ? x_mat_dims[1] : x_mat_dims[0], platform::errors::InvalidArgument( "The first dimension of DOut should be equal with X's first" "dimension. But received DOut[0] = [%d], Y[0] = [%d].", @@ -288,7 +292,7 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput("DBias")) { std::vector dbias_dims; - dbias_dims.push_back(y_dims[1]); + dbias_dims.push_back(trans_y ? y_dims[0] : y_dims[1]); ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); } } @@ -323,6 +327,20 @@ class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("DBias", "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") .AsDispensable(); + AddAttr( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X could be more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two-dimensional. When + set trans_y=true, it would transpose Y. For instance: Y with shape + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); AddAttr( "activation_grad", @@ -343,11 +361,38 @@ X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] } }; +template +class FusedGemmEpilogueOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + const auto& act_type = this->template Attr("activation"); + PADDLE_ENFORCE_EQ(act_type, "none", phi::errors::InvalidArgument( + "The activation should be none.")); + + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Y", this->Input("Y")); + op->SetInput("DOut", this->OutputGrad("Out")); + + op->SetOutput("DX", this->InputGrad("X")); + op->SetOutput("DY", this->InputGrad("Y")); + op->SetOutput("DBias", this->InputGrad("Bias")); + + op->SetAttrMap(this->Attrs()); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, - ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR( + fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker, + ops::FusedGemmEpilogueOpGradMaker, + ops::FusedGemmEpilogueOpGradMaker); REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, - ops::FusedGemmEpilogueGradOpMaker) + ops::FusedGemmEpilogueGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 9bf3d1a485efc..407cd2b974def 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -15,6 +15,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h" #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/float16.h" @@ -41,6 +42,8 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { bool trans_y = ctx.Attr("trans_y"); std::string activation = ctx.Attr("activation"); + VLOG(10) << "trans_x = " << trans_x << " , trans_y = " << trans_y + << " , activation = " << activation; bool enable_auxiliary = reserve_space == nullptr ? false : true; out->mutable_data(ctx.GetPlace()); @@ -48,6 +51,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { auto x_mat_dims = phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + // (M * K) * (K * N) int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; @@ -106,10 +110,11 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { platform::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &aux_data, sizeof(aux_data))); + int64_t aux_ld = N; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, - sizeof(N))); + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &aux_ld, + sizeof(aux_ld))); } cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; @@ -129,8 +134,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { &out_desc, mat_type, N, M, N)); cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); - size_t workspace_size = 4 * 1024 * 1024; - + size_t workspace_size = static_cast(4) * 1024 * 1024 * 1024; cudaStream_t stream = dev_ctx.stream(); memory::allocation::AllocationPtr workspace = memory::Alloc(dev_ctx, workspace_size); @@ -149,13 +153,13 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { const auto* y_data = y->data(); const auto* x_data = x->data(); - cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( lt_handle, operation_desc, y_desc, x_desc, out_desc, alpha, beta, y_data, x_data, out_data, stream, workspace->ptr(), workspace_size); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( lt_handle, operation_desc, alpha, y_data, y_desc, x_data, x_desc, beta, - out_data, out_desc, out_data, out_desc, &algo, workspace->ptr(), + out_data, out_desc, out_data, out_desc, algo, workspace->ptr(), workspace_size, stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -191,12 +195,94 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { } }; +enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 }; + +template +struct FusedGEMMGradTrait; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + 
static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = false; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = false; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradATrans = true; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = true; +}; + +static constexpr auto BoolToCuBlasEnum(bool transpose) { + return transpose ? CUBLAS_OP_T : CUBLAS_OP_N; +} + template class FusedGemmEpilogueGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + bool transpose_x = ctx.Attr("trans_x"); + bool transpose_y = ctx.Attr("trans_y"); + if (transpose_x) { + if (transpose_y) { + ComputeImpl(ctx); + } else { + ComputeImpl(ctx); + } + } else { + if (transpose_y) { + ComputeImpl(ctx); + } else { + ComputeImpl(ctx); + } + } + } + + private: + template + static void ComputeImpl(const framework::ExecutionContext& ctx) { + using Trait = FusedGEMMGradTrait; + auto& dev_ctx = ctx.template device_context(); const Tensor* dout = ctx.Input("DOut"); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -208,13 +294,18 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { std::string activation_grad = ctx.Attr("activation_grad"); - auto dout_mat_dims = - phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); - auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + VLOG(10) << "trans_x = " << TransX << " , trans_y = " << TransY + << " , activation_grad = " << activation_grad; + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), TransX ? 1 : x->dims().size() - 1); + + // (M * K) * (K * N) + int64_t M = TransX ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = TransY ? y->dims()[1] : y->dims()[0]; + int64_t N = TransY ? 
y->dims()[0] : y->dims()[1]; - int64_t M = x_mat_dims[0]; - int64_t K = y->dims()[0]; - int64_t N = y->dims()[1]; + VLOG(10) << "M = " << M << " , K = " << K << " , N = " << N; cudaDataType_t mat_type = CUDA_R_32F; cudaDataType_t scale_type = CUDA_R_32F; @@ -229,7 +320,8 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { } cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); - size_t workspace_size = 4 * 1024 * 1024; + size_t workspace_size = static_cast(4) * 1024 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; cudaStream_t stream = dev_ctx.stream(); double alpha64 = 1.0, beta64 = 0.0; @@ -243,24 +335,81 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { beta = &beta32; } - cublasOperation_t trans_dout = CUBLAS_OP_N; - cublasLtMatrixLayout_t dout_desc = NULL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &dout_desc, mat_type, N, M, N)); + cublasLtMatrixLayout_t dout_desc = nullptr, dout_trans_desc = nullptr; + cublasLtMatrixLayout_t x_desc = nullptr, x_trans_desc = nullptr; + cublasLtMatrixLayout_t y_desc = nullptr, y_trans_desc = nullptr; + cublasLtMatrixLayout_t dx_desc = nullptr, dy_desc = nullptr; + cublasLtMatmulDesc_t dx_operation_desc = nullptr, + dy_operation_desc = nullptr; + + DEFINE_PADDLE_SCOPE_GUARD([&] { + auto descs = {dout_desc, dout_trans_desc, x_desc, x_trans_desc, + y_desc, y_trans_desc, dx_desc, dy_desc}; + for (auto desc : descs) { + if (desc) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutDestroy(desc)); + } + } + if (dx_operation_desc) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescDestroy(dx_operation_desc)); + } + + if (dy_operation_desc) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescDestroy(dy_operation_desc)); + } + }); + + auto x_row = TransX ? K : M; + auto x_col = TransX ? M : K; + auto y_row = TransY ? N : K; + auto y_col = TransY ? K : N; + auto z_row = TransX ? N : M; + auto z_col = TransX ? M : N; + + // dx = func(dout, y) if (dx) { - cublasLtMatmulDesc_t dx_operation_desc = NULL; + constexpr auto kXGradAIsDZ = (Trait::kXGradA == FusedGEMMGradInType::kDZ); + cublasLtMatrixLayout_t *dx_dout_desc, *dx_y_desc; + + if (TransX) { + dx_dout_desc = &dout_trans_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dx_dout_desc, mat_type, z_row, z_col, z_row)); + } else { + dx_dout_desc = &dout_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dx_dout_desc, mat_type, z_col, z_row, z_col)); + } + + dx_y_desc = &y_trans_desc; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + dx_y_desc, mat_type, y_col, y_row, y_col)); + + auto& a_desc = kXGradAIsDZ ? (*dx_dout_desc) : (*dx_y_desc); + auto& b_desc = kXGradAIsDZ ? 
(*dx_y_desc) : (*dx_dout_desc); + auto a_trans = BoolToCuBlasEnum(Trait::kXGradATrans); + auto b_trans = BoolToCuBlasEnum(Trait::kXGradBTrans); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, x_col, x_row, x_col)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( &dx_operation_desc, compute_type, scale_type)); - cublasOperation_t trans_y = CUBLAS_OP_T; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, - sizeof(trans_dout))); + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans, + sizeof(a_trans))); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, - sizeof(trans_y))); + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans, + sizeof(b_trans))); + cublasLtEpilogue_t epiloque_func_for_dx = get_epilogue_type_(activation_grad); PADDLE_ENFORCE_GPU_SUCCESS( @@ -274,105 +423,116 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { platform::dynload::cublasLtMatmulDescSetAttribute( dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &aux_data, sizeof(aux_data))); + int64_t aux_ld = TransX ? M : K; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &K, - sizeof(K))); + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, + &aux_ld, sizeof(aux_ld))); } - cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &y_desc, mat_type, N, K, N)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &dx_desc, mat_type, K, M, K)); - - memory::allocation::AllocationPtr dx_workspace = - memory::Alloc(dev_ctx, workspace_size); + auto dx_workspace = memory::Alloc(dev_ctx, workspace_size); - dx->mutable_data(ctx.GetPlace()); - auto* dx_data = dx->data(); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); const auto* y_data = y->data(); const auto* dout_data = dout->data(); + const auto* a_data = kXGradAIsDZ ? dout_data : y_data; + const auto* b_data = kXGradAIsDZ ? 
y_data : dout_data; - cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( - lt_handle, dx_operation_desc, y_desc, dout_desc, dx_desc, alpha, beta, - y_data, dout_data, dx_data, stream, dx_workspace->ptr(), - workspace_size); + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( + lt_handle, dx_operation_desc, b_desc, a_desc, dx_desc, alpha, beta, + b_data, a_data, dx_data, stream, dx_workspace->ptr(), workspace_size); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( - lt_handle, dx_operation_desc, alpha, y->data(), y_desc, - dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, - &algo, dx_workspace->ptr(), workspace_size, stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulDescDestroy(dx_operation_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(y_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(dx_desc)); + lt_handle, dx_operation_desc, alpha, b_data, b_desc, a_data, a_desc, + beta, dx_data, dx_desc, dx_data, dx_desc, algo, dx_workspace->ptr(), + workspace_size, stream)); } + // dy = func(dout, x) if (dy) { - cublasLtMatmulDesc_t dy_operation_desc = NULL; + constexpr auto kYGradAIsDZ = (Trait::kYGradA == FusedGEMMGradInType::kDZ); + + cublasLtMatrixLayout_t *dy_dout_desc = nullptr, *dy_x_desc = nullptr; + if (TransX) { + dy_dout_desc = &dout_trans_desc; + if (dout_trans_desc == nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dy_dout_desc, mat_type, z_row, z_col, z_row)); + } + } else { + dy_dout_desc = &dout_desc; + if (dout_desc == nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dy_dout_desc, mat_type, z_col, z_row, z_col)); + } + } + + dy_x_desc = &x_trans_desc; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + dy_x_desc, mat_type, x_col, x_row, x_col)); + + auto& a_desc = kYGradAIsDZ ? (*dy_dout_desc) : (*dy_x_desc); + auto& b_desc = kYGradAIsDZ ? (*dy_x_desc) : (*dy_dout_desc); + auto a_trans = BoolToCuBlasEnum(Trait::kYGradATrans); + auto b_trans = BoolToCuBlasEnum(Trait::kYGradBTrans); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, y_col, y_row, y_col)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( &dy_operation_desc, compute_type, scale_type)); - cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, - sizeof(trans_dout))); + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans, + sizeof(a_trans))); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, - sizeof(trans_x))); - cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr - ? 
CUBLASLT_EPILOGUE_DEFAULT - : CUBLASLT_EPILOGUE_BGRADA; + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans, + sizeof(b_trans))); + + cublasLtEpilogue_t epiloque_func_for_dy; + if (dbias == nullptr) { + epiloque_func_for_dy = CUBLASLT_EPILOGUE_DEFAULT; + } else { + if (TransY) { + epiloque_func_for_dy = CUBLASLT_EPILOGUE_BGRADB; + } else { + epiloque_func_for_dy = CUBLASLT_EPILOGUE_BGRADA; + } + } + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); if (dbias) { - dbias->mutable_data(ctx.GetPlace()); - auto* dbias_data = dbias->data(); + auto* dbias_data = dbias->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &dbias_data, sizeof(dbias_data))); } - cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &x_desc, mat_type, K, M, K)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &dy_desc, mat_type, N, K, N)); - - memory::allocation::AllocationPtr dy_workspace = - memory::Alloc(dev_ctx, workspace_size); - - dy->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->data(); + auto dy_workspace = memory::Alloc(dev_ctx, workspace_size); + auto* dy_data = dy->mutable_data(ctx.GetPlace()); const auto* dout_data = dout->data(); const auto* x_data = x->data(); + const auto* a_data = kYGradAIsDZ ? dout_data : x_data; + const auto* b_data = kYGradAIsDZ ? x_data : dout_data; - cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( - lt_handle, dy_operation_desc, dout_desc, x_desc, dy_desc, alpha, beta, - dout_data, x_data, dy_data, stream, dy_workspace->ptr(), - workspace_size); + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( + lt_handle, dy_operation_desc, b_desc, a_desc, dy_desc, alpha, beta, + b_data, a_data, dy_data, stream, dy_workspace->ptr(), workspace_size); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( - lt_handle, dy_operation_desc, alpha, dout_data, dout_desc, x_data, - x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, &algo, - dy_workspace->ptr(), workspace_size, stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulDescDestroy(dy_operation_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(x_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(dy_desc)); + lt_handle, dy_operation_desc, alpha, b_data, b_desc, a_data, a_desc, + beta, dy_data, dy_desc, dy_data, dy_desc, algo, dy_workspace->ptr(), + workspace_size, stream)); } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(dout_desc)); } private: diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h index c90a6966fe0a8..8ff41b2c9616b 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h @@ -21,7 +21,9 @@ limitations under the License. 
*/ #include #include "gflags/gflags.h" #include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/utils/optional.h" DECLARE_int64(cublaslt_exhaustive_search_times); @@ -39,12 +41,14 @@ class GemmEpilogueAlgoCache { GemmEpilogueAlgoCache(GemmEpilogueAlgoCache const &) = delete; void operator=(GemmEpilogueAlgoCache const &) = delete; - cublasLtMatmulAlgo_t GetGemmAlgo( + cublasLtMatmulAlgo_t *GetGemmAlgo( cublasLtHandle_t lt_handle, cublasLtMatmulDesc_t op_desc, cublasLtMatrixLayout_t a_desc, cublasLtMatrixLayout_t b_desc, cublasLtMatrixLayout_t c_desc, const void *alpha, const void *beta, const void *a, const void *b, void *c, cudaStream_t stream, void *workspace, size_t workspace_size) { + if (search_times_ <= 0) return nullptr; + int64_t seed = 0; std::hash hash_fn; @@ -54,132 +58,108 @@ class GemmEpilogueAlgoCache { HashMatrixLayoutDesc_(c_desc, &seed, hash_fn); cublasLtMatmulAlgo_t ret; - auto it = map_.end(); - bool have_found = false; { std::lock_guard lock(cache_mutex_); - it = map_.find(seed); - + auto it = map_.find(seed); if (it != map_.end()) { - ret = it->second; - have_found = true; + return &(it->second); } } - if (!have_found) { - cublasLtMatmulPreference_t preference; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulPreferenceCreate(&preference)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulPreferenceSetAttribute( - preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &workspace_size, sizeof(workspace_size))); - - int returned_results = 0; - cublasLtMatmulHeuristicResult_t heuristic_results[requested_algo_count_] = - {0}; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulAlgoGetHeuristic( - lt_handle, op_desc, a_desc, b_desc, c_desc, c_desc, preference, - requested_algo_count_, heuristic_results, &returned_results)); - - PADDLE_ENFORCE_GT( - returned_results, 0, - platform::errors::Unavailable("No GEMM epilogue algorithm support!")); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulPreferenceDestroy(preference)); - - if (search_times_ > 0) { - int best_algo_idx = -1; - float best_algo_time = 0; - - // Run 100 times for warmup - int warmup_algo_idx = 0; - for (int t = 0; t < 100; t++) { - cublasStatus_t status = platform::dynload::cublasLtMatmul( - lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, - c, c_desc, &heuristic_results[warmup_algo_idx].algo, workspace, - workspace_size, stream); - if (status != CUBLAS_STATUS_SUCCESS) { - t = -1; - warmup_algo_idx += 1; - if (warmup_algo_idx == requested_algo_count_) { - PADDLE_THROW(platform::errors::Unavailable( - "No GEMM epilogue algorithm support!")); - } - } - } + cublasLtMatmulPreference_t preference; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulPreferenceCreate(&preference)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size, sizeof(workspace_size))); - cudaEvent_t start_event, stop_event; - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); - - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - float curr_time = 0; - for (int check_idx = 0; check_idx < search_times_; check_idx++) { - float time = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); - - cublasStatus_t status = platform::dynload::cublasLtMatmul( 
- lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, - c_desc, c, c_desc, &heuristic_results[algo_idx].algo, workspace, - workspace_size, stream); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(stop_event)); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventElapsedTime(&time, start_event, stop_event)); - curr_time += time; - if (status != CUBLAS_STATUS_SUCCESS) { - curr_time = 3.40282e+038; // Max Value of float - break; - } - } - - curr_time = curr_time / search_times_; - if (curr_time < best_algo_time || algo_idx == 0) { - best_algo_idx = algo_idx; - best_algo_time = curr_time; - } - } + int returned_results = 0; + std::vector heuristic_results( + requested_algo_count_); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulAlgoGetHeuristic( + lt_handle, op_desc, a_desc, b_desc, c_desc, c_desc, preference, + requested_algo_count_, heuristic_results.data(), + &returned_results)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(start_event)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(stop_event)); + PADDLE_ENFORCE_GT( + returned_results, 0, + platform::errors::Unavailable("No GEMM epilogue algorithm support!")); - if (best_algo_idx == -1) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulPreferenceDestroy(preference)); + + int best_algo_idx = -1; + float best_algo_time = 0; + + // Run 100 times for warmup + int warmup_algo_idx = 0; + for (int t = 0; t < 100; t++) { + cublasStatus_t status = platform::dynload::cublasLtMatmul( + lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, c, + c_desc, &heuristic_results[warmup_algo_idx].algo, workspace, + workspace_size, stream); + if (status != CUBLAS_STATUS_SUCCESS) { + t = -1; + warmup_algo_idx += 1; + if (warmup_algo_idx == requested_algo_count_) { PADDLE_THROW(platform::errors::Unavailable( "No GEMM epilogue algorithm support!")); } + } + } - ret = heuristic_results[best_algo_idx].algo; - } else { - int decided_algo_idx = -1; - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - cublasStatus_t status = platform::dynload::cublasLtMatmul( - lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, - c, c_desc, &heuristic_results[algo_idx].algo, workspace, - workspace_size, stream); - if (status == CUBLAS_STATUS_SUCCESS) { - decided_algo_idx = algo_idx; - break; - } - } - if (decided_algo_idx == -1) { - PADDLE_THROW(platform::errors::Unavailable( - "No GEMM epilogue algorithm support!")); + cudaEvent_t start_event, stop_event; + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); + + for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { + float curr_time = 0; + for (int check_idx = 0; check_idx < search_times_; check_idx++) { + float time = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); + + cublasStatus_t status = platform::dynload::cublasLtMatmul( + lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, c, + c_desc, &heuristic_results[algo_idx].algo, workspace, + workspace_size, stream); + + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(stop_event)); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventElapsedTime(&time, start_event, stop_event)); + curr_time += time; + if (status != CUBLAS_STATUS_SUCCESS) { + curr_time = 3.40282e+038; // Max Value of float + break; } - ret = heuristic_results[decided_algo_idx].algo; } - std::lock_guard 
lock(cache_mutex_); - map_[seed] = ret; + curr_time = curr_time / search_times_; + if (curr_time < best_algo_time || algo_idx == 0) { + best_algo_idx = algo_idx; + best_algo_time = curr_time; + } } - VLOG(4) << "Search time:" << search_times_ << ", Is hash-key (" << seed - << ") found in GemmEpilogueAlgoCache? " << have_found; + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(start_event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(stop_event)); + + if (best_algo_idx == -1) { + PADDLE_THROW( + platform::errors::Unavailable("No GEMM epilogue algorithm support!")); + } + + ret = heuristic_results[best_algo_idx].algo; + + VLOG(4) << "Search time:" << search_times_ << ", hash-key (" << seed + << ") not found in GemmEpilogueAlgoCache"; - return ret; + std::lock_guard lock(cache_mutex_); + auto &algo_in_map = map_[seed]; + algo_in_map = ret; + return &algo_in_map; } private: diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index c95ca6fe0c96c..98602e4edd0a2 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -221,7 +221,7 @@ class FusedMultiTransformerOpOpMaker "'dropout_rate' must be between 0.0 and 1.0.")); }); - AddAttr("dropout_is_test", + AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index 05d6bae5f719a..91dccbee0aef2 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -45,19 +45,17 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { out_new_dims[i] = out_new_dims[i] > 0 ? 
out_new_dims[i] : x_vec_dims[i]; } - dnnl::memory::desc x_mem_desc = x->mem_desc(); if (x_vec_dims.size() != out_new_dims.size()) { - x_mem_desc = GetExtendedMemoryDescriptor(x_mem_desc, x_vec_dims, - out_new_dims.size()); + x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size()); } out->Resize(phi::make_ddim(out_new_dims)); paddle::platform::BroadcastDataMKLDNNHandler handler( - dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), out, x, - 0.0f, 1.0f, x_mem_desc); + dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), x, out, + 0.0f, 1.0f, x_vec_dims); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); // acquires zeroed mem + auto dst_memory_p = handler.AcquireZeroedDstMemory(out); auto binary_p = handler.AcquireForwardPrimitive(); const std::unordered_map args = { @@ -73,14 +71,13 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { } private: - dnnl::memory::desc GetExtendedMemoryDescriptor( - const dnnl::memory::desc& x_mem_desc, - const std::vector& x_vec_dims, int new_size) const { - std::vector new_dims(new_size, 1); + std::vector GetExtendedXDims(const std::vector& x_vec_dims, + int new_size) const { + std::vector extended_x_dims(new_size, 1); std::copy(x_vec_dims.begin(), x_vec_dims.end(), - new_dims.begin() + new_size - x_vec_dims.size()); + extended_x_dims.begin() + new_size - x_vec_dims.size()); - return x_mem_desc.reshape(new_dims); + return extended_x_dims; } }; diff --git a/paddle/fluid/operators/nanmedian_op.cc b/paddle/fluid/operators/nanmedian_op.cc new file mode 100644 index 0000000000000..23a497bdb1d3d --- /dev/null +++ b/paddle/fluid/operators/nanmedian_op.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +class NanmedianOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class NanmedianOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input feature data of NanmedianOp, dtype should be" + "int32, int64, float16, float32 or float64."); + AddOutput( + "MedianIndex", + "Store the index position of median values, The calculation differs " + "in the odd or even valid elements numbers." + "Along the axis, two elements contributed to the median value in " + "each row." 
+ "If the amount of valid elements were even, both were the same.") + .AsIntermediate() + .AsExtra(); + AddOutput("Out", + "(Tensor)," + " the output of NanmedianOp, whose dtype is the same as X"); + AddAttr("keepdim", + "(bool, default true) " + "If true, retain the reduced axis with length 1.") + .SetDefault(true); + AddAttr>("axis", + "(std::vector). List of integers," + " indicating the dimensions to calculate medians") + .SetDefault({}); + AddComment(R"DOC( + Nanmedian operator + + This operator is considered as an extention of median operation, + which supports specifically the case of NaN values in the input. + + If all the elements in input are NaN it will also return NaN. + If no elements in input are Nan, this op is identical to thie median op. + + If the valid count of elements is a even number, the average value of + the elements in the middle is calculated as the median. + + This operator can also supports multiple axis. + )DOC"); + } +}; + +template +class NanmedianGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr op) const override { + op->SetType("nanmedian_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("MedianIndex", this->Output("MedianIndex")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +class NanmedianGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(nanmedian, NanmedianInferShapeFunctor, + PD_INFER_META(phi::NanmedianInferMeta)); + +REGISTER_OPERATOR(nanmedian, ops::NanmedianOp, ops::NanmedianOpMaker, + ops::NanmedianGradMaker, + ops::NanmedianGradMaker, + NanmedianInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(nanmedian_grad, NanmedianGradInferShapeFunctor, + PD_INFER_META(phi::NanmedianGradInferMeta)); + +REGISTER_OPERATOR(nanmedian_grad, ops::NanmedianGradOp, + NanmedianGradInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op_mlu.cc b/paddle/fluid/operators/one_hot_v2_op_mlu.cc new file mode 100644 index 0000000000000..855cdda963cb6 --- /dev/null +++ b/paddle/fluid/operators/one_hot_v2_op_mlu.cc @@ -0,0 +1,86 @@ + +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class OneHotV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + int depth = ctx.Attr("depth"); + if (ctx.HasInput("depth_tensor")) { + std::vector depth_data; + depth_data = GetDataFromTensor(ctx.Input("depth_tensor")); + depth = depth_data[0]; + + auto out_dims = out->dims(); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + out->mutable_data(ctx.GetPlace()); + + float on_value = 1.0f, off_value = 0.0f; + const int in_off_dim[1] = {1}; + Tensor on_value_tensor = ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + Tensor off_value_tensor = ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor); + FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor); + + if (framework::TransToProtoVarType(in->dtype()) == + framework::proto::VarType::INT32) { + MLUCnnlTensorDesc desc_indices(*in); + MLUCnnl::OneHot(ctx, desc_indices.get(), GetBasePtr(in), depth, + GetBasePtr(&on_value_tensor), + GetBasePtr(&off_value_tensor), -1, + ToCnnlDataType(out->dtype()), GetBasePtr(out)); + } else { + Tensor transformed_in; + transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); + // use cnnlCast to cast int64_t to int32_t then do one_hot + MLUCnnlTensorDesc in_desc(*in); + MLUCnnlTensorDesc transformed_in_desc(transformed_in); + cnnlCastDataType_t cast_type = GetCastDataType( + framework::TransToProtoVarType(in->dtype()), + framework::TransToProtoVarType(transformed_in.dtype())); + MLUCnnl::Cast(ctx, cast_type, in_desc.get(), GetBasePtr(in), + transformed_in_desc.get(), GetBasePtr(&transformed_in)); + MLUCnnl::OneHot( + ctx, transformed_in_desc.get(), GetBasePtr(&transformed_in), depth, + GetBasePtr(&on_value_tensor), GetBasePtr(&off_value_tensor), -1, + ToCnnlDataType(out->dtype()), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel); diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 0c174b0825c9f..94d8cc41d3f31 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -29,11 +29,11 @@ inline std::vector CalculateReducedDims( bool reduce_all, bool keep_dim) { if (keep_dim) return phi::vectorize(output->dims()); - if (reduce_all) - return std::vector(phi::vectorize(input->dims()).size(), 1); + if (reduce_all) return std::vector(input->dims().size(), 1); std::vector output_dims(phi::vectorize(input->dims())); for (size_t i = 0; i < reduce_dims.size(); ++i) { + // handle negative dims, f.e. "-1" means rightmost dimension reduce_dims[i] = (reduce_dims[i] >= 0) ? 
reduce_dims[i] : input->dims().size() + reduce_dims[i]; @@ -52,16 +52,16 @@ class ReduceMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - const auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto reduce_dims = ctx.Attr>("dim"); bool reduce_all = ctx.Attr("reduce_all"); bool keep_dim = ctx.Attr("keep_dim"); - auto output_dims = - CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); - auto input_dims = phi::vectorize(input->dims()); + auto x_tz = phi::vectorize(x->dims()); + auto out_tz = + CalculateReducedDims(x, out, reduce_dims, reduce_all, keep_dim); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -69,18 +69,19 @@ class ReduceMKLDNNKernel : public framework::OpKernel { // copied without actual reduction. // In that case reorder must be executed to maintain compatibility with // PaddlePaddle reduce op - if (input_dims == output_dims) { - dnnl::memory::data_type input_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); + if (x_tz == out_tz) { + dnnl::memory::data_type x_type = framework::ToMKLDNNDataType( + framework::TransToProtoVarType(x->dtype())); platform::ReorderMKLDNNHandler reorder_handler( - input_dims, framework::TransToProtoVarType(input->dtype()), - input_type, onednn_engine); + x_tz, framework::TransToProtoVarType(x->dtype()), x_type, + onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - input->mem_desc(), platform::to_void_cast(input->data())); + x->mem_desc(), platform::to_void_cast(x->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - output, input->mem_desc(), ctx.GetPlace()); + // reuse mem desc since it is a simple copy + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); @@ -88,15 +89,15 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } else { platform::ReductionMKLDNNHandler handler(reduction_type, 0.0f, 0.0f, onednn_engine, ctx.GetPlace(), - input, output, output_dims); + x, out, out_tz); - auto src_memory_p = handler.AcquireSrcMemory(input); - auto dst_memory_p = handler.AcquireDstMemory(output); + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); std::unordered_map reduction_args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; @@ -105,8 +106,9 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + + out->set_mem_desc(dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } } }; @@ -127,22 +129,15 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel { const auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - const auto input_dims = - CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); - const auto output_dims = phi::vectorize(dx->dims()); 
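As a quick reference for the dim bookkeeping above, here is a minimal standalone sketch of the shape that CalculateReducedDims hands to the oneDNN reduction primitive. The free-standing helper below is illustrative only (the real function works on framework tensors and also short-circuits through keep_dim and the output dims); it is written so it can be compiled and run on its own.

#include <cstdint>
#include <iostream>
#include <vector>

// Rank-preserving "reduced" dims: every reduced axis becomes 1, which is the
// destination shape the oneDNN reduction primitive expects.
std::vector<int64_t> ReducedDims(const std::vector<int64_t>& in_dims,
                                 const std::vector<int64_t>& reduce_dims,
                                 bool reduce_all) {
  const int64_t rank = static_cast<int64_t>(in_dims.size());
  if (reduce_all) return std::vector<int64_t>(rank, 1);
  std::vector<int64_t> out = in_dims;
  for (int64_t d : reduce_dims) {
    if (d < 0) d += rank;  // e.g. "-1" means the rightmost dimension
    out[static_cast<size_t>(d)] = 1;
  }
  return out;
}

int main() {
  for (int64_t v : ReducedDims({2, 3, 4}, {-1}, /*reduce_all=*/false))
    std::cout << v << ' ';  // prints: 2 3 1
  std::cout << '\n';
}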
- - auto dout_mem_desc = dout->mem_desc(); - - if (input_dims != output_dims) { - dout_mem_desc = dout_mem_desc.reshape(input_dims); - } + auto dout_tz = CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); + auto dx_tz = phi::vectorize(dx->dims()); - platform::BroadcastDataMKLDNNHandler handler( - binary_type, onednn_engine, ctx.GetPlace(), dx, dout, scale_x, scale_y, - dout_mem_desc); + platform::BroadcastDataMKLDNNHandler handler(binary_type, onednn_engine, + ctx.GetPlace(), dout, dx, + scale_x, scale_y, dout_tz); const auto src_memory_p = handler.AcquireSrcMemory(dout); - const auto dst_memory_p = handler.AcquireDstMemory(dx); + const auto dst_memory_p = handler.AcquireZeroedDstMemory(dx); const auto binary_prim = handler.AcquireForwardPrimitive(); const std::unordered_map args = { diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index 04660fb501142..e3d8d15a305a9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -112,6 +112,8 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Input("Out"); auto* out_grad = context.Input(framework::GradVarName("Out")); + auto reduce_dims = context.Attr>("dim"); + bool reduce_all = context.Attr("reduce_all"); int in_dtype = context.Attr("in_dtype"); PADDLE_ENFORCE_EQ( @@ -129,12 +131,30 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { // broadcast auto x_dims_vec = phi::vectorize(x->dims()); + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < x_dims_vec.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + + Tensor tmp_out, tmp_out_grad; + auto tmp_out_dims_vec = x_dims_vec; + for (auto d : reduce_dims) { + tmp_out_dims_vec[d] = 1; + } + + tmp_out.ShareDataWith(*out); + tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec)); + tmp_out_grad.ShareDataWith(*out_grad); + tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); + Tensor transformed_out(x->type()); transformed_out.Resize(phi::make_ddim(x_dims_vec)); transformed_out.mutable_data(place); NpuOpRunner r_brd_out; r_brd_out.SetType("BroadcastTo") - .AddInput(*out) + .AddInput(tmp_out) .AddInput(std::move(x_dims_vec)) .AddOutput(transformed_out) .Run(stream); @@ -143,7 +163,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { transformed_out_grad.mutable_data(place); NpuOpRunner r_brd_out_grad; r_brd_out_grad.SetType("BroadcastTo") - .AddInput(*out_grad) + .AddInput(tmp_out_grad) .AddInput(std::move(x_dims_vec)) .AddOutput(transformed_out_grad) .Run(stream); diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc new file mode 100644 index 0000000000000..c543a088e9d7f --- /dev/null +++ b/paddle/fluid/operators/rrelu_op.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class RReluOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class RReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of RReLU op."); + AddOutput("Out", "The output of RReLU op."); + AddOutput("Noise", "The randomly sampled RReLU noise.") + .AsIntermediate() + .AsExtra(); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + float default_lower = 1. / 8.; + AddAttr("lower", "Lower bound of the uniform distribution.") + .SetDefault(default_lower) + .AddCustomChecker([](const float& lower) { + PADDLE_ENFORCE_EQ(lower >= 0.0f && lower < 1.0f, true, + platform::errors::InvalidArgument( + "'RRelu_lower' must be between 0.0 and 1.0.")); + }); + float default_upper = 1. / 3.; + AddAttr("upper", "Upper bound of the uniform distribution.") + .SetDefault(default_upper) + .AddCustomChecker([](const float& upper) { + PADDLE_ENFORCE_EQ(upper > 0.0f && upper <= 1.0f, true, + platform::errors::InvalidArgument( + "'RRelu_upper' must be between 0.0 and 1.0.")); + }); + AddComment(R"DOC( +RReLU Operator. + +Applies the randomized leaky rectified linear unit function, element-wise, +as described in the paper: + +`Empirical Evaluation of Rectified Activations in Convolutional Network`_. + +The function is defined as: + +.. math:: + \text{RReLU}(x) = + \begin{cases} + x & \text{if } x \geq 0 \\ + ax & \text{ otherwise } + \end{cases} + +where :math:`a` is randomly sampled from the uniform distribution +:math:`\mathcal{U}(\text{lower}, \text{upper})`.
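To make the formula above concrete, a small host-side sketch of the forward behaviour follows. This is not the kernel registered by this patch; the test-time slope of (lower + upper) / 2 and the exact contents of Noise are assumptions based on the usual RReLU convention.

#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

// Negative inputs are scaled by a slope drawn from U(lower, upper); the
// sampled slopes ("noise") are kept so a backward pass could reuse them.
void RReluForward(const std::vector<float>& x, float lower, float upper,
                  bool is_test, std::vector<float>* out,
                  std::vector<float>* noise, std::mt19937* gen) {
  std::uniform_real_distribution<float> dist(lower, upper);
  out->resize(x.size());
  noise->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    const float a = is_test ? 0.5f * (lower + upper) : dist(*gen);
    (*noise)[i] = x[i] >= 0.0f ? 1.0f : a;  // slope actually applied
    (*out)[i] = x[i] >= 0.0f ? x[i] : a * x[i];
  }
}

int main() {
  std::mt19937 gen(0);
  std::vector<float> out, noise;
  RReluForward({-2.0f, 3.0f}, 1.0f / 8.0f, 1.0f / 3.0f, /*is_test=*/false,
               &out, &noise, &gen);
  // First element is -2 scaled by a random slope in [1/8, 1/3]; second is 3.
  std::cout << out[0] << ' ' << out[1] << '\n';
}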
+ + See: https://arxiv.org/pdf/1505.00853.pdf + +)DOC"); + } +}; + +class RReluGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +template +class RReluGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("rrelu_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Noise", this->Output("Noise")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(rrelu, RReluInferShapeFunctor, + PD_INFER_META(phi::RReluInferMeta)); + +REGISTER_OPERATOR(rrelu, ops::RReluOp, ops::RReluOpMaker, + ops::RReluGradOpMaker, + ops::RReluGradOpMaker, + RReluInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(rrelu_grad, RReluGradInferShapeFunctor, + PD_INFER_META(phi::RReluGradInferMeta)); +REGISTER_OPERATOR(rrelu_grad, ops::RReluGradOp, RReluGradInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_op_mlu.cc b/paddle/fluid/operators/softmax_op_mlu.cc index 9cb698e94fc56..9b97e779f29ef 100644 --- a/paddle/fluid/operators/softmax_op_mlu.cc +++ b/paddle/fluid/operators/softmax_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class SoftmaxMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,7 +45,7 @@ class SoftmaxMLUKernel : public framework::OpKernel { regard_in_shape = {d1, d2, d3}; } - static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + static const cnnlSoftmaxAlgorithm_t algo = softmax_algo; MLUCnnlTensorDesc in_desc(cnnl_softmax_dims, regard_in_shape.data(), ToCnnlDataType()); MLUCnnl::SoftmaxForward(ctx, algo, mode, NULL, in_desc.get(), @@ -54,7 +54,7 @@ class SoftmaxMLUKernel : public framework::OpKernel { } }; -template +template class SoftmaxGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -82,7 +82,7 @@ class SoftmaxGradMLUKernel : public framework::OpKernel { regard_out_shape = {d1, d2, d3}; } - static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + static const cnnlSoftmaxAlgorithm_t algo = softmax_algo; MLUCnnlTensorDesc out_desc(cnnl_softmax_dims, regard_out_shape.data(), ToCnnlDataType()); MLUCnnl::SoftmaxBackward(ctx, algo, mode, out_desc.get(), GetBasePtr(out), @@ -97,7 +97,16 @@ class SoftmaxGradMLUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_MLU_KERNEL(softmax, ops::SoftmaxMLUKernel, - ops::SoftmaxMLUKernel); -REGISTER_OP_MLU_KERNEL(softmax_grad, ops::SoftmaxGradMLUKernel, - ops::SoftmaxGradMLUKernel); +REGISTER_OP_MLU_KERNEL( + softmax, ops::SoftmaxMLUKernel, + ops::SoftmaxMLUKernel); +REGISTER_OP_MLU_KERNEL(softmax_grad, + ops::SoftmaxGradMLUKernel, + ops::SoftmaxGradMLUKernel); +REGISTER_OP_MLU_KERNEL( + log_softmax, ops::SoftmaxMLUKernel, + ops::SoftmaxMLUKernel); +REGISTER_OP_MLU_KERNEL( + log_softmax_grad, ops::SoftmaxGradMLUKernel, + ops::SoftmaxGradMLUKernel); diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 5410638ceb39a..8c04e935134c7 100644 --- 
a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -225,9 +225,9 @@ class RecordedGpuMallocHelper { if (UNLIKELY(malloc_managed_memory)) { result = cudaMallocManaged(ptr, size); } else { - VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) - << " MB"; result = cudaMalloc(ptr, size); + VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) + << " MB, result=" << result; } #endif if (result == gpuSuccess) { diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 61ea0fd3cd293..d0cb9c953a5bf 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -31,6 +31,8 @@ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -52,6 +54,10 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclFloat16; } else if (type == framework::proto::VarType::INT8) { return ncclInt8; +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + } else if (type == framework::proto::VarType::BF16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); @@ -69,6 +75,10 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) { return ncclInt64; } else if (type == experimental::DataType::FLOAT16) { return ncclFloat16; +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + } else if (type == experimental::DataType::BFLOAT16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 13b5005a30fa0..5476d244f6035 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -616,29 +616,17 @@ class BinaryMKLDNNHandler public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, const dnnl::engine engine, platform::Place cpu_place, - const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, + const Tensor* x, const Tensor* y, Tensor* out, + float scale_x, float scale_y, float scale_out, const dnnl::post_ops& post_ops = dnnl::post_ops{}) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, x->layout())); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, y->layout())); - const auto src_x_tz = phi::vectorize(x->dims()); const auto src_y_tz = phi::vectorize(y->dims()); // if output tensor(z) is nullptr then we are computing into oneDNN // managed buffer auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : phi::vectorize(z->dims()); + const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? 
src_x_tz : src_y_tz) + : phi::vectorize(out->dims()); auto src0_md = x->mem_desc(); auto src1_md = y->mem_desc(); @@ -667,7 +655,7 @@ class BinaryMKLDNNHandler MKLDNNMemoryFormat::any); auto attributes = - CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); + CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); @@ -681,7 +669,7 @@ class BinaryMKLDNNHandler private: static inline dnnl::primitive_attr CreateAttributes( - dnnl::algorithm op, float scale_x, float scale_y, float scale_z, + dnnl::algorithm op, float scale_x, float scale_y, float scale_out, dnnl::post_ops post_ops = dnnl::post_ops{}) { // Scales set in attributes for inputs contibute to the output equation // in the following way (assuming no broadcasting takes place): @@ -699,9 +687,9 @@ class BinaryMKLDNNHandler // For mul operation on the other hand // output = (scale_out / scale_x) * x * (1.0 / scale_y) * y // - float scale_0 = scale_z / scale_x; + float scale_0 = scale_out / scale_x; float scale_1 = - op == dnnl::algorithm::binary_add ? scale_z / scale_y : 1.0 / scale_y; + op == dnnl::algorithm::binary_add ? scale_out / scale_y : 1.0 / scale_y; dnnl::primitive_attr attributes; attributes.set_scales(/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0}); @@ -718,21 +706,15 @@ class BroadcastDataMKLDNNHandler public: BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, const dnnl::engine engine, - platform::Place cpu_place, const Tensor* out, - const Tensor* x, float scale_x, float scale_y, - const dnnl::memory::desc& x_mem_desc) + platform::Place cpu_place, const Tensor* x, + Tensor* out, float scale_x, float scale_y, + const std::vector& extended_x_dims) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - const auto src0_tz = phi::vectorize(out->dims()); - const auto src0_md = dnnl::memory::desc(src0_tz, platform::MKLDNNGetDataType(), platform::GetPlainMKLDNNFormat(src0_tz.size())); - - const auto src1_md = x_mem_desc; + const auto src1_md = x->mem_desc().reshape(extended_x_dims); dnnl::primitive_attr attributes; attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); @@ -743,9 +725,9 @@ class BroadcastDataMKLDNNHandler } template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = output->mutable_data( - this->place_, this->fwd_pd_->dst_desc().get_size()); + std::shared_ptr AcquireZeroedDstMemory(framework::Tensor* out) { + T_out* ptr = out->mutable_data(this->place_, + this->fwd_pd_->dst_desc().get_size()); memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -758,22 +740,18 @@ class ReductionMKLDNNHandler ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, const float eps, const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, - const Tensor* y, std::vector y_tz, - const dnnl::primitive_attr& attr = NULL) + const Tensor* out, std::vector out_tz, + const dnnl::primitive_attr& attrs = NULL) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - - const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType(), - dnnl::memory::format_tag::any); + const auto out_md = memory::desc(out_tz, platform::MKLDNNGetDataType(), 
+ dnnl::memory::format_tag::any); - if (attr) - this->AcquireForwardPrimitiveDescriptor(attr, algo, x->mem_desc(), y_md, - p, eps); + if (attrs) + this->AcquireForwardPrimitiveDescriptor(attrs, algo, x->mem_desc(), + out_md, p, eps); else - this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), y_md, p, + this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), out_md, p, eps); } }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index d24c0355c2493..954bac00ddbd7 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2224,9 +2224,9 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_CNCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL) py::class_>(m, "ParallelContext"); diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2b849968c76f9..972e8aafab758 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -32,10 +32,16 @@ std::map> op_ins_map = { {"fused_attention", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "SrcMask", "OutLinearW", "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, + {"fused_gate_attention", + {"Query", "Key", "QueryWeight", "KeyWeight", "ValueWeight", "QKVWeight", + "NonbatchedBias", "SrcMask", "GateWeight", "GateBias", "OutLinearWeight", + "OutLinearBias"}}, {"fused_multi_transformer", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "TimeStep", "SrcMask", "OutLinearW", "OutLinearBias", "FFNLnScale", "FFNLnBias", "FFN1Weight", "FFN1Bias", "FFN2Weight", "FFN2Bias"}}, + {"fused_bias_dropout_residual_layer_norm", + {"X", "Residual", "Bias", "LnScale", "LnBias"}}, {"instance_norm", {"X", "Scale", "Bias"}}, {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, {"label_smooth", {"X", "PriorDist"}}, @@ -119,6 +125,11 @@ std::map> op_ins_map = { {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, {"inplace_abn", {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, + {"linear_interp", {"X", "OutSize"}}, + {"bilinear_interp", {"X", "OutSize"}}, + {"trilinear_interp", {"X", "OutSize"}}, + {"nearest_interp", {"X", "OutSize"}}, + {"bicubic_interp", {"X", "OutSize"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -148,6 +159,11 @@ std::map> op_outs_map = { "DropoutMaskOut", "Ln2Mean", "Ln2Variance", "BiasDropoutResidualOut", "CacheKVOut", "Y"}}, + {"fused_bias_dropout_residual_layer_norm", + {"BiasDropoutResidualOut", "DropoutMaskOut", "LnMean", "LnVariance", "Y"}}, + {"fused_gate_attention", + {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut", + "QKVTransposeOut", "SoftmaxOut", "FMHAOut", "GateOut", "Out"}}, {"sync_batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, @@ -259,6 +275,7 @@ std::map> op_passing_outs_map = { {"split", {"Out"}}, {"concat", {"Out"}}, {"fused_multi_transformer", {"CacheKVOut"}}, + {"group_norm", {"Mean", "Variance"}}, }; // NOTE(pangyoki): Tensor View Strategy. 
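Before the phi API additions that follow, here is a small self-contained check of the scale arithmetic described in the mkldnn_reuse.h comment above (scale_z is renamed to scale_out there). The helper name and the int8-style example values below are illustrative; only the two formulas come from that comment.

#include <cassert>
#include <utility>

// For a quantized binary op, oneDNN applies one scale per source so that
//   add: out = (scale_out / scale_x) * x + (scale_out / scale_y) * y
//   mul: out = (scale_out / scale_x) * x * (1.0f / scale_y) * y
std::pair<float, float> BinaryScales(bool is_add, float scale_x, float scale_y,
                                     float scale_out) {
  const float scale_0 = scale_out / scale_x;
  const float scale_1 = is_add ? scale_out / scale_y : 1.0f / scale_y;
  return {scale_0, scale_1};
}

int main() {
  // Both inputs quantized with scale 127, output requantized with scale 63.5.
  auto add = BinaryScales(/*is_add=*/true, 127.0f, 127.0f, 63.5f);
  assert(add.first == 0.5f && add.second == 0.5f);
  auto mul = BinaryScales(/*is_add=*/false, 127.0f, 127.0f, 63.5f);
  assert(mul.first == 0.5f && mul.second == 1.0f / 127.0f);
  return 0;
}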
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index b6431fcbe690e..3ef7763d57e8b 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -531,6 +531,108 @@ Tensor conv2d_impl(const Tensor& input, return api_output; } +Tensor conv3d_impl(const Tensor& input, + const Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + kernel_data_type = ParseDataType(input); + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "conv3d API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "conv3d", {kernel_backend, kernel_layout, kernel_data_type}, true); + VLOG(6) << "conv3d API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + phi::TensorArgDef args0 = kernel.InputAt(0); + phi::TensorArgDef args1 = kernel.InputAt(1); + if (kernel_backend == Backend::GPU) { + args0.backend = Backend::GPU; + args1.backend = Backend::GPU; + } + + auto input_input = PrepareData(input, args0, {}); + auto input_filter = PrepareData(filter, args1, {}); + + Tensor api_output; + auto kernel_out = SetKernelOutput(kernel_backend, &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::ConvInferMeta(MakeMetaTensor(*input_input), + MakeMetaTensor(*input_filter), + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const std::vector&, + const std::vector&, + const std::string&, + int, + const std::vector&, + const std::string&, + bool, + int, + bool, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + { + (*kernel_fn)(*dev_ctx, + *input_input, + *input_filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + kernel_out); + } + + return api_output; +} + void conv2d_grad_impl(const Tensor& input, const Tensor& filter, const Tensor& out_grad, @@ -632,12 +734,187 @@ void conv2d_grad_impl(const Tensor& input, } } +void conv3d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* 
filter_grad) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, out_grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "conv3d_grad API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "conv3d_grad", {kernel_backend, kernel_layout, kernel_data_type}, true); + VLOG(6) << "conv3d_grad API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + phi::TensorArgDef args0 = kernel.InputAt(0); + phi::TensorArgDef args1 = kernel.InputAt(1); + phi::TensorArgDef args2 = kernel.InputAt(2); + if (kernel_backend == Backend::GPU) { + args0.backend = Backend::GPU; + args1.backend = Backend::GPU; + args2.backend = Backend::GPU; + } + + auto input_input = PrepareData(input, args0, {}); + auto input_filter = PrepareData(filter, args1, {}); + auto input_out_grad = PrepareData(out_grad, args2, {}); + + auto kernel_out_0 = SetKernelOutput(kernel_backend, input_grad); + auto kernel_out_1 = SetKernelOutput(kernel_backend, filter_grad); + phi::MetaTensor meta_out_0(kernel_out_0); + phi::MetaTensor meta_out_1(kernel_out_1); + + phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input), + MakeMetaTensor(*input_filter), + kernel_out_0 ? &meta_out_0 : nullptr, + kernel_out_1 ? 
&meta_out_1 : nullptr); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const std::vector&, + const std::vector&, + const std::string&, + int, + const std::vector&, + const std::string&, + bool, + int, + bool, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + { + (*kernel_fn)(*dev_ctx, + *input_input, + *input_filter, + *input_out_grad, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + kernel_out_0, + kernel_out_1); + } +} + Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { Tensor out; copy(x, place, blocking, &out); return out; } +Tensor embedding_impl(const Tensor& x, + const Tensor& weight, + int64_t padding_idx, + bool sparse) { + DataType kernel_data_type = ParseDataType(weight); + auto kernel_key_set = ParseKernelKeyByInputArgs(weight); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "embedding API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + Tensor api_output; + + if (phi::DenseTensor::classof(weight.impl().get())) { + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + "embedding", + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << "embedding API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = PrepareData(weight, kernel.InputAt(1), {}); + + auto* kernel_out = SetKernelOutput(kernel_key.backend(), &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::EmbeddingInferMeta(MakeMetaTensor(*input_x), + MakeMetaTensor(*input_weight), + padding_idx, + sparse, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + { + (*kernel_fn)(*dev_ctx, *input_x, *input_weight, padding_idx, kernel_out); + } + } else { + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + "sparse_weight_embedding", + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << "sparse_weight_embedding API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = TensorToSelectedRows(weight); + + auto* kernel_out = SetKernelOutput(kernel_key.backend(), &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::EmbeddingInferMeta(MakeMetaTensor(*input_x), + MakeMetaTensor(*input_weight), + padding_idx, + sparse, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + { + (*kernel_fn)(*dev_ctx, *input_x, *input_weight, padding_idx, kernel_out); + } + } + return api_output; +} + std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis) { @@ -1176,6 +1453,125 @@ void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad) { (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); } +void embedding_grad_impl(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx, + bool sparse, + Tensor* weight_grad) { + 
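  // Dispatch note (descriptive comment added for readability): the body below
  // selects one of four phi kernels depending on how the weight is stored and
  // whether a sparse gradient was requested:
  //   DenseTensor weight,  sparse == false -> "embedding_grad"
  //   DenseTensor weight,  sparse == true  -> "embedding_sparse_grad"
  //   SelectedRows weight, sparse == false -> "sparse_weight_embedding_grad"
  //   SelectedRows weight, sparse == true  -> "sparse_weight_embedding_sparse_grad"
  // In the sparse cases the gradient is returned as SelectedRows holding only
  // the touched rows; otherwise a dense weight-shaped gradient is produced.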
DataType kernel_data_type = ParseDataType(weight); + auto kernel_key_set = ParseKernelKeyByInputArgs(weight); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "embedding_grad API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + if (phi::DenseTensor::classof(weight.impl().get())) { + std::string kernel_name = + sparse ? "embedding_sparse_grad" : "embedding_grad"; + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = PrepareData(weight, kernel.InputAt(1), {}); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {}); + + if (sparse) { + auto* kernel_out = + SetSelectedRowsKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + meta_out.set_dims(input_weight->dims()); + meta_out.set_dtype(input_weight->dtype()); + kernel_out->set_height(input_weight->dims()[0]); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } else { + auto* kernel_out = SetKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } + } else { + std::string kernel_name = sparse ? 
"sparse_weight_embedding_sparse_grad" + : "sparse_weight_embedding_grad"; + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = TensorToSelectedRows(weight); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {}); + + if (sparse) { + auto* kernel_out = + SetSelectedRowsKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + const phi::DenseTensor&, + int64_t, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } else { + auto* kernel_out = SetKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + meta_out.set_dims(input_weight->GetCompleteDims()); + meta_out.set_dtype(input_weight->dtype()); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } + } +} + void real_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index f8ccbb36c5ca7..22c5d193a2bcd 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -96,8 +96,25 @@ Tensor conv2d_impl(const Tensor& input, int workspace_size_MB, bool exhaustive_search); +Tensor conv3d_impl(const Tensor& input, + const Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search); + Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); +Tensor embedding_impl(const Tensor& x, + const Tensor& weight, + int64_t padding_idx, + bool sparse); + std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis); @@ -143,8 +160,30 @@ void conv2d_grad_impl(const Tensor& input, Tensor* input_grad, Tensor* filter_grad); +void conv3d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad); + void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad); +void embedding_grad_impl(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx, + bool sparse, + Tensor* weight_grad); + void real_grad_impl(const Tensor& out_grad, Tensor* x_grad); } // namespace experimental diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 
78f8ff9e00ce5..f59ea5549bd71 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -313,10 +313,10 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, } void InstanceNormGradInferMeta(const MetaTensor& x, - const MetaTensor& y_grad, const MetaTensor& scale, const MetaTensor& saved_mean, const MetaTensor& saved_variance, + const MetaTensor& y_grad, float epsilon, MetaTensor* x_grad, MetaTensor* scale_grad, @@ -433,6 +433,17 @@ void MultiplexGradInferMeta(const MetaTensor& ids, } } +void NanmedianGradInferMeta(const MetaTensor& x, + const MetaTensor& median_index, + const MetaTensor& out_grad, + const IntArray& axes, + bool keep_dim, + MetaTensor* x_grad) { + auto x_dims = x.dims(); + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); +} + void NllLossGradInferMeta(const MetaTensor& x, const MetaTensor& label, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index b52734eb5b10c..0e7ed640d8ffb 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -145,10 +145,10 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, MetaTensor* dx); void InstanceNormGradInferMeta(const MetaTensor& x, - const MetaTensor& y_grad, const MetaTensor& scale, const MetaTensor& saved_mean, const MetaTensor& saved_variance, + const MetaTensor& y_grad, float epsilon, MetaTensor* x_grad, MetaTensor* scale_grad, @@ -191,6 +191,13 @@ void MultiplexGradInferMeta(const MetaTensor& ids, const MetaTensor& out_grad, std::vector ins_grad); +void NanmedianGradInferMeta(const MetaTensor& x, + const MetaTensor& median_index, + const MetaTensor& out_grad, + const IntArray& axes, + bool keep_dim, + MetaTensor* x_grad); + void NllLossGradInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 76b6fcdd52efc..a8d5ad564fe9b 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -983,6 +983,32 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void EmbeddingInferMeta(const MetaTensor& x, + const MetaTensor& weight, + int64_t padding_idx, + bool sparse, + MetaTensor* out) { + const auto& table_dims = weight.dims(); + const auto& ids_dims = x.dims(); + int ids_rank = ids_dims.size(); + VLOG(5) << "ids rank is " << ids_rank << std::endl; + PADDLE_ENFORCE_EQ( + table_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: The dimensions of the 'lookup table' must be 2. 
" + "But received lookup table's dimensions = %d, " + "lookup table's shape = [%s].", + table_dims.size(), + table_dims)); + + auto output_dims = phi::vectorize(ids_dims); + output_dims.push_back(table_dims[1]); + out->set_dims(phi::make_ddim(output_dims)); + out->set_dtype(weight.dtype()); + out->share_lod(x); +} + void ExpandAsInferMeta(const MetaTensor& x, const MetaTensor& y, const std::vector& target_shape, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 0c86e5389c4b4..2cd34406fc2d2 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -154,6 +154,12 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void EmbeddingInferMeta(const MetaTensor& x, + const MetaTensor& weight, + int64_t padding_idx, + bool sparse, + MetaTensor* out); + void ExpandAsInferMeta(const MetaTensor& x, const MetaTensor& y, const std::vector& target_shape, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 1ec804d1bf822..0beb7223f212a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1246,6 +1246,65 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } +void NanmedianInferMeta(const MetaTensor& x, + const IntArray& axes, + bool keep_dim, + MetaTensor* out, + MetaTensor* median_index) { + std::vector axis_list = axes.GetData(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + out->set_dtype(x.dtype()); + median_index->set_dtype(DataType::INT64); + median_index->set_dims(make_ddim({x.numel() * 2})); + + std::vector out_dim; + if (axis_list.empty()) { + if (keep_dim) { + for (int64_t i = 0; i < x_rank; i++) { + out_dim.push_back(1); + } + } else { + out_dim.push_back(1); + } + } else { + std::vector cleaned_axis; + for (auto& axis : axis_list) { + if (axis < 0) axis += x_rank; + + PADDLE_ENFORCE_LT( + axis, + x_rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], R is " + "the rank of Input(X). But received axis: %d, R: %d. " + "Current Input(X)'s shape is=[%s].", + axis, + x_rank, + x_dim)); + + PADDLE_ENFORCE_EQ( + std::find(cleaned_axis.begin(), cleaned_axis.end(), axis), + cleaned_axis.end(), + errors::InvalidArgument("Attr(axes) has duplicated elements: %d.", + static_cast(axis))); + + cleaned_axis.push_back(axis); + } + + for (int64_t i = 0; i < x_rank; i++) { + if (std::find(cleaned_axis.begin(), cleaned_axis.end(), i) == + cleaned_axis.end()) { + out_dim.push_back(x_dim[i]); + } else if (keep_dim) { + out_dim.push_back(1); + } + } + } + + out->set_dims(make_ddim(out_dim)); +} + void NormInferMeta(const MetaTensor& x, int axis, float epsilon, @@ -1918,6 +1977,55 @@ void RollInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RReluInferMeta(const MetaTensor& x, + float lower, + float upper, + bool is_test, + MetaTensor* out, + MetaTensor* noise) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE(lower, + 0, + phi::errors::InvalidArgument( + "The lower value should be greater than or equal to 0. " + "But received lower value = %f.", + lower)); + PADDLE_ENFORCE_LE(upper, + 1, + phi::errors::InvalidArgument( + "The upper value should be less than or equal to 1. 
" + "But received upper value = %f.", + upper)); + PADDLE_ENFORCE_GE( + upper, + lower, + phi::errors::InvalidArgument( + "The upper value should be greater than or equal to lower value " + "But received upper value = %f, lower value = %f.", + upper, + lower)); + + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); + + if (noise != nullptr) { + noise->set_dims(x_dims); + noise->set_dtype(x.dtype()); + noise->set_layout(x.layout()); + } +} + +void RReluGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& noise, + MetaTensor* x_grad) { + auto do_dims = out_grad.dims(); + x_grad->set_dims(do_dims); + x_grad->set_dtype(out_grad.dtype()); + x_grad->share_lod(out_grad); +} + void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { auto in_dims = x.dims(); PADDLE_ENFORCE_LT( diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 25ea003f58fd9..a288b9371016f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -178,6 +178,13 @@ void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); + +void NanmedianInferMeta(const MetaTensor& x, + const IntArray& axes, + bool keep_dim, + MetaTensor* out, + MetaTensor* median_index); + void NormInferMeta(const MetaTensor& x, int axis, float epsilon, @@ -274,6 +281,17 @@ void RollInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void RReluInferMeta(const MetaTensor& x, + float lower, + float upper, + bool is_test, + MetaTensor* out, + MetaTensor* noise); + +void RReluGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& noise, + MetaTensor* x_grad); + void SetValueInferMeta(const MetaTensor& x, MetaTensor* out); void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 5ed9d72a503a5..2349bf990acd3 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -26,7 +26,7 @@ template void AssignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - Copy(dev_ctx, x, x.place(), false, out); + paddle::framework::TensorCopy(x, x.place(), out); } template diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h index 799c8721c3cff..f25cbe384c213 100644 --- a/paddle/phi/kernels/conv_grad_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -40,11 +40,11 @@ void ConvGradGradKernel(const Context& dev_ctx, template void Conv3DGradGradKernel(const Context& dev_ctx, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, @@ -54,8 +54,8 @@ void Conv3DGradGradKernel(const Context& dev_ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad); + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); } // namespace phi diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc index c52f2614150d8..4538ccf9433f9 100644 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ 
b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -21,11 +21,11 @@ namespace phi { template void Conv3DGradGradKernel(const Context& ctx, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -35,9 +35,9 @@ void Conv3DGradGradKernel(const Context& ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search_t, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { ConvGradGradKernel(ctx, input, filter, diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index 340d2907a7909..867d43fd833de 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -42,10 +42,10 @@ using EigenVectorArrayMap = Eigen::Map>; template void InstanceNormGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& d_y, const paddle::optional& scale, const DenseTensor& saved_mean, const DenseTensor& saved_variance, + const DenseTensor& d_y, float epsilon, DenseTensor* d_x, DenseTensor* d_scale, diff --git a/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc new file mode 100644 index 0000000000000..f8639a0d10fee --- /dev/null +++ b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/nanmedian_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h" + +namespace phi { + +template +void CalcMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad, + T* x_grad_ptr) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0)); + if (!x_grad_ptr) return; + + const int64_t* m_ptr = median_index.data(); + const T* out_grad_ptr = out_grad.data(); + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t rank = x_dim.size(); + int64_t stride = x_dim[rank - 1]; + + int64_t pre_dim = numel / stride; + int64_t i = 0; + int64_t offset = 0; + T div_factor = static_cast(2.0); + for (i = 0; i < pre_dim; i++) { + if (m_ptr[2 * i] >= 0) { + if (m_ptr[2 * i] == m_ptr[2 * i + 1]) { + x_grad_ptr[offset + m_ptr[2 * i]] = out_grad_ptr[i]; + } else { + x_grad_ptr[offset + m_ptr[2 * i]] = out_grad_ptr[i] / div_factor; + x_grad_ptr[offset + m_ptr[2 * i + 1]] = out_grad_ptr[i] / div_factor; + } + } + offset += stride; + } +} + +template +void BaseMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad) { + auto rank = x.dims().size(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (axes.size() && (rank > 1)) { + DenseTensor tmp_x_grad(*x_grad); + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, axes, &tmp_x_grad, x_grad_ptr); + PostprocessMedianGradKernel(dev_ctx, &tmp_x_grad, axes, x_grad); + } else { + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, axes, x_grad, x_grad_ptr); + } +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad) { + BaseMedianGradKernel( + dev_ctx, input, median_index, out_grad, axes, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian_grad, + CPU, + ALL_LAYOUT, + phi::NanmedianGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc new file mode 100644 index 0000000000000..03d7fe304be3e --- /dev/null +++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/nanmedian_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h" +#include "paddle/phi/kernels/top_k_kernel.h" + +namespace phi { + +template +void CalcMedianFunc(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& nan_counts, + bool ignore_nan, + int64_t sort_k, + int64_t stride, + int64_t pre_dim, + T* o_ptr, + int64_t* m_ptr) { + bool should_ignore_nan = ignore_nan; + DenseTensor sort_out; + DenseTensor sort_indices; + auto sort_dim = x.dims(); + int64_t rank = sort_dim.size(); + sort_dim[rank - 1] = sort_k; + sort_out.Resize(sort_dim); + sort_indices.Resize(sort_dim); + + dev_ctx.template Alloc(&sort_out); + T* sort_out_ptr = sort_out.data(); + dev_ctx.template Alloc(&sort_indices); + int64_t* sort_indices_ptr = sort_indices.data(); + + TopkKernel( + dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); + + T div_factor = static_cast(2.0); + int64_t offset = 0; + int64_t i = 0; + bool is_ori_odd = stride & 1; + if (should_ignore_nan) { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + if (nan_counts[i] == stride) { + m_ptr[i * 2] = -1; + m_ptr[i * 2 + 1] = -1; + o_ptr[i] = sort_out_ptr[offset]; + } else { + int64_t nan_k = nan_counts[i] > 0 + ? static_cast(stride - nan_counts[i]) + : sort_k; + int64_t row_pos = static_cast(nan_k >> 1); + int64_t pos = offset + row_pos; + if (nan_k & 1) { + m_ptr[2 * i] = sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + o_ptr[i] = sort_out_ptr[pos]; + } else { + m_ptr[2 * i] = + row_pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + T m_val_left = + row_pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T m_val_right = sort_out_ptr[pos]; + o_ptr[i] = (m_val_left + m_val_right) / div_factor; + } + } + } + } else { + if (is_ori_odd) { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + int64_t pos = offset + sort_k - 1; + o_ptr[i] = sort_out_ptr[pos]; + m_ptr[2 * i] = sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + } + } else { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + int64_t pos = offset + sort_k - 1; + m_ptr[2 * i] = + sort_k > 1 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + T m_val_left = sort_k > 1 ? 
sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T m_val_right = sort_out_ptr[pos]; + o_ptr[i] = (m_val_left + m_val_right) / div_factor; + } + } + } +} + +template +void ProcessMedianKernel(const Context& dev_ctx, + const DenseTensor& x, + T* o_ptr, + int64_t* m_ptr, + bool ignore_nan) { + bool should_ignore_nan = ignore_nan; + const T* x_ptr = x.data(); + + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + int64_t i = 0; + + int64_t max_valid_num = 0; + std::vector nan_counts; + if (should_ignore_nan) { + int64_t total_nan_num = 0; + std::vector col_vec; + col_vec.reserve(stride); + col_vec.resize(stride); + nan_counts.clear(); + nan_counts.reserve(pre_dim); + nan_counts.resize(pre_dim); + for (int64_t i = 0; i < pre_dim; i++) { + col_vec.clear(); + col_vec.insert( + col_vec.begin(), x_ptr + i * stride, x_ptr + (i + 1) * stride); + nan_counts[i] = + std::count_if(col_vec.begin(), col_vec.end(), [&](const T& val) { + return std::isnan(static_cast(val)); + }); + total_nan_num += nan_counts[i]; + if (stride - nan_counts[i] > max_valid_num) + max_valid_num = stride - nan_counts[i]; + } + // all elems are nan + if (total_nan_num == numel) { + for (i = 0; i < pre_dim; i++) { + o_ptr[i] = x_ptr[0]; + m_ptr[2 * i] = -1; + m_ptr[2 * i + 1] = -1; + } + return; + } + should_ignore_nan = total_nan_num > 0; + } + + int64_t sort_k = should_ignore_nan ? max_valid_num : ((stride >> 1) + 1); + CalcMedianFunc(dev_ctx, + x, + nan_counts, + should_ignore_nan, + sort_k, + stride, + pre_dim, + o_ptr, + m_ptr); +} + +template +void BaseMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& axes, + DenseTensor* out, + DenseTensor* median_index, + bool ignore_nan) { + DenseTensor x; + auto rank = input.dims().size(); + if ((axes.size() == 0) || rank <= 1) { + x = input; + x.Resize({input.numel()}); + } else { + PreprocessMedianKernel(dev_ctx, input, axes, &x); + } + + T* o_ptr = dev_ctx.template Alloc(out); + int64_t* m_ptr = dev_ctx.template Alloc(median_index); + ProcessMedianKernel(dev_ctx, x, o_ptr, m_ptr, ignore_nan); + out->Resize(out->dims()); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keepdim, + DenseTensor* out, + DenseTensor* median_index) { + BaseMedianKernel(dev_ctx, x, axes, out, median_index, true); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian, + CPU, + ALL_LAYOUT, + phi::NanmedianKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc b/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc new file mode 100644 index 0000000000000..10b6c6b1a3ea8 --- /dev/null +++ b/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
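// A standalone per-row sketch of the selection rule used by CalcMedianFunc
// above, under the assumption of a single row: NaNs are dropped, only the
// smallest half-plus-one of the remaining values needs to be sorted (this is
// the sort_k prefix obtained via TopkKernel in the kernel; std::partial_sort
// stands in for it here), and the median is the last of that prefix for an odd
// count or the mean of the last two for an even count. The helper name
// NanmedianRow is illustrative.
#include <algorithm>
#include <cmath>
#include <vector>

double NanmedianRow(std::vector<double> row) {
  row.erase(std::remove_if(row.begin(), row.end(),
                           [](double v) { return std::isnan(v); }),
            row.end());
  if (row.empty()) return std::nan("");       // all-NaN row
  const size_t k = row.size() / 2 + 1;        // mirrors sort_k = (n >> 1) + 1
  std::partial_sort(row.begin(), row.begin() + k, row.end());
  return (row.size() & 1) ? row[k - 1] : (row[k - 2] + row[k - 1]) / 2.0;
}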
+ +#include "paddle/phi/kernels/rrelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + const T* n_ptr = noise.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + if (!x_grad) return; + + int i = 0; + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] : n_ptr[i] * out_grad_ptr[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + rrelu_grad, CPU, ALL_LAYOUT, phi::RReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/rrelu_kernel.cc b/paddle/phi/kernels/cpu/rrelu_kernel.cc new file mode 100644 index 0000000000000..4c6e30beddfa3 --- /dev/null +++ b/paddle/phi/kernels/cpu/rrelu_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/rrelu_kernel.h" + +#include "paddle/fluid/framework/generator.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + T* n_ptr = dev_ctx.template Alloc(noise); + T zero = static_cast(0); + int numel = x.numel(); + int i = 0; + + if (is_test) { + T mid_val = static_cast((lower + upper) / 2.0); + for (i = 0; i < numel; i++) { + if (x_ptr[i] < zero) { + o_ptr[i] = mid_val * x_ptr[i]; + n_ptr[i] = mid_val; + } else { + o_ptr[i] = x_ptr[i]; + n_ptr[i] = 1.0; + } + } + + return; + } + + auto engine = paddle::framework::GetCPURandomEngine(0); + + std::uniform_real_distribution dist(lower, upper); + + for (i = 0; i < numel; i++) { + if (x_ptr[i] < zero) { + T scale = static_cast(dist(*engine)); + o_ptr[i] = scale * x_ptr[i]; + n_ptr[i] = scale; + } else { + o_ptr[i] = x_ptr[i]; + n_ptr[i] = 1.0; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu, + CPU, + ALL_LAYOUT, + phi::RReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cc b/paddle/phi/kernels/funcs/eigen/broadcast.cc index 3459d7acd6baf..008c51249f249 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cc +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cu b/paddle/phi/kernels/funcs/eigen/broadcast.cu index d9de69ec55e8b..742081a30c1a0 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cu +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cu @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 336e9c809427c..a6a6d4097030b 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -27,10 +27,10 @@ namespace phi { namespace funcs { -inline int64_t GetBatchSize(phi::DDim dims) { +inline int64_t GetBatchSize(const phi::DDim &dims) { int64_t batch_size = 1; auto dim_size = dims.size(); - for (int i = 0; i < dim_size - 2; i++) { + for (int i = 0; i < dim_size - 2; ++i) { batch_size *= dims[i]; } return batch_size; @@ -54,6 +54,24 @@ static void CheckEighResult(const int batch, const int info) { info)); } +#ifdef PADDLE_WITH_CUDA +static void CheckEighResult(const GPUContext &dev_ctx, + const int64_t batch_size, + int *info) { + std::vector error_info(batch_size); + paddle::memory::Copy(phi::CPUPlace(), + error_info.data(), + dev_ctx.GetPlace(), + info, + sizeof(int) * batch_size, + dev_ctx.stream()); + dev_ctx.Wait(); + for (auto i = 0; i < batch_size; ++i) { + CheckEighResult(i, error_info[i]); + } +} +#endif + template struct MatrixEighFunctor { void operator()(const DeviceContext &dev_ctx, @@ -95,7 +113,8 @@ struct MatrixEighFunctor { char jobz = has_vectors ? 
'V' : 'N'; int n = dims[dim_size - 1]; int64_t lda = std::max(1, n); - // if work = -1, it means that you need to use the lapack function to query + // if work = -1, it means that you need to use the lapack function to + // query // the optimal value int lwork = -1; // The length of the array work int lrwork = -1; // The dimension of the array rwork,rwork is REAL array @@ -188,97 +207,92 @@ struct MatrixEighFunctor { bool is_lower, bool has_vectors) { using ValueType = phi::dtype::Real; - ValueType *out_value = dev_ctx.template Alloc(eigen_values); - DenseTensor input_trans; - input_trans = phi::TransposeLast2Dim(dev_ctx, input); - T *input_vector = input_trans.data(); + int workspace_size = 0; auto &dims = input.dims(); int dim_size = dims.size(); int64_t batch_size = GetBatchSize(dims); + int last_dim = dims[dim_size - 1]; + int lda = std::max(1, last_dim); + auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; + auto values_stride = dims[dim_size - 1]; cublasFillMode_t uplo = is_lower ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; cusolverEigMode_t jobz = has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; - int n = dims[dim_size - 1]; - int lda = std::max(1, n); - auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; - auto values_stride = dims[dim_size - 1]; - int lwork = 0; + ValueType *out_value = dev_ctx.template Alloc(eigen_values); auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size); auto *info_ptr = reinterpret_cast(info->ptr()); - // When the input type is float32, and the feature value input dimension - // is greater than or equal to [*,32,32] and less than or equal to - // [*,512,512], Syevj has better performance. + DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); + T *input_vector = input_trans.data(); + + // Once input data type is float32, and the last dimension of + // input is located in range [32, 512], Syevj works better. 
bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 && values_stride >= 32 && values_stride <= 512); + auto handle = dev_ctx.cusolver_dn_handle(); + syevjInfo_t syevj_params; if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCreateSyevjInfo(&syevj_params)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( dev_ctx.cusolver_dn_handle(), jobz, uplo, - n, + last_dim, reinterpret_cast(input_vector), lda, reinterpret_cast(out_value), - &lwork, + &workspace_size, syevj_params)); } else { EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, - n, + last_dim, input_vector, lda, out_value, - &lwork); + &workspace_size); } - auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * lwork); + auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size); auto *work_ptr = reinterpret_cast(work->ptr()); - for (auto i = 0; i < batch_size; i++) { + + for (auto i = 0; i < batch_size; ++i) { auto *input_data = input_vector + i * vector_stride; auto *value_data = out_value + i * values_stride; - auto handle = dev_ctx.cusolver_dn_handle(); if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnSsyevj(handle, jobz, uplo, - n, + last_dim, reinterpret_cast(input_data), lda, reinterpret_cast(value_data), reinterpret_cast(work_ptr), - lwork, - info_ptr, + workspace_size, + &info_ptr[i], syevj_params)); } else { Evd(handle, jobz, uplo, - n, + last_dim, input_data, lda, value_data, work_ptr, - lwork, - info_ptr); + workspace_size, + &info_ptr[i]); } - int error_info = 0; - paddle::memory::Copy(phi::CPUPlace(), - &error_info, - dev_ctx.GetPlace(), - info_ptr, - sizeof(int), - dev_ctx.stream()); - CheckEighResult(i, error_info); } + CheckEighResult(dev_ctx, batch_size, info_ptr); if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu index 6ca8dbd9205d8..950f811475c99 100644 --- a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu @@ -24,4 +24,5 @@ PD_REGISTER_KERNEL(einsum_grad, phi::EinsumGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index b72acc7073383..b2c2df2d3f055 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -290,10 +290,10 @@ __global__ void DoubleGradComputeDScale(const T *x, template void InstanceNormGradKernel(const Context &dev_ctx, const DenseTensor &x, - const DenseTensor &d_y, const paddle::optional &scale, const DenseTensor &saved_mean, const DenseTensor &saved_variance, + const DenseTensor &d_y, float epsilon_f, DenseTensor *d_x, DenseTensor *d_scale, diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu new file mode 100644 index 0000000000000..1661d396641af --- /dev/null +++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu @@ -0,0 +1,124 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/nanmedian_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelNanmedianGrad(const T* x_ptr, + const int64_t* medians_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + int64_t stride, + int64_t pre_dim, + T div_factor) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t offset = index * stride; + if (medians_ptr[2 * index] >= 0) { + if (medians_ptr[2 * index] == medians_ptr[2 * index + 1]) { + x_grad_ptr[offset + medians_ptr[2 * index]] = out_grad_ptr[index]; + } else { + x_grad_ptr[offset + medians_ptr[2 * index]] = + out_grad_ptr[index] / div_factor; + x_grad_ptr[offset + medians_ptr[2 * index + 1]] = + out_grad_ptr[index] / div_factor; + } + } + } +} + +template +void CalcMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + DenseTensor* x_grad, + T* x_grad_ptr) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0)); + + auto stream = dev_ctx.stream(); + const T* x_ptr = x.data(); + const int64_t* m_ptr = median_index.data(); + const T* out_grad_ptr = out_grad.data(); + + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + + T div_factor = static_cast(2.0); + KernelNanmedianGrad< + T><<>>( + x_ptr, m_ptr, out_grad_ptr, x_grad_ptr, stride, pre_dim, div_factor); +} + +template +void BaseMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad) { + auto rank = x.dims().size(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (axes.size() && (rank > 1)) { + DenseTensor tmp_x_grad(*x_grad); + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, &tmp_x_grad, x_grad_ptr); + PostprocessMedianGradKernel(dev_ctx, &tmp_x_grad, axes, x_grad); + } else { + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, x_grad, x_grad_ptr); + } +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad) { + BaseMedianGradKernel( + dev_ctx, input, median_index, out_grad, axes, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian_grad, + GPU, + ALL_LAYOUT, + phi::NanmedianGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git 
a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu new file mode 100644 index 0000000000000..a67d64c257761 --- /dev/null +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -0,0 +1,289 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/nanmedian_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h" +#include "paddle/phi/kernels/top_k_kernel.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelNanCounts(const T* input, + const int numel, + const int64_t pre_dim, + const int64_t stride, + T min_val, + int64_t* nan_total, + int64_t* nan_counts) { + extern __shared__ int64_t buf[]; + for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { + buf[i] = 0; + nan_counts[i] = 0; + } + + if (threadIdx.x == 0) { + nan_total[0] = 0; + nan_total[1] = 0; + } + + __syncthreads(); + + CUDA_KERNEL_LOOP(index, numel) { + const T x = input[index]; + if (isnan(static_cast(x))) { + auto bin = static_cast(index / stride); + paddle::platform::CudaAtomicAdd(&buf[bin], 1); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&nan_counts[i], buf[i]); + paddle::platform::CudaAtomicAdd(&nan_total[0], buf[i]); + paddle::platform::CudaAtomicMax(&nan_total[1], stride - buf[i]); + } +} + +template +__global__ void CalcMedianKernel(const T* sort_out_ptr, + const int64_t* sort_indices_ptr, + int64_t* median_val, + T* output, + T div_factor, + const bool is_odd, + const int64_t pre_dim, + const int64_t stride) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t pos = static_cast((index + 1) * stride) - 1; + if (is_odd) { + median_val[index * 2] = sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + output[index] = sort_out_ptr[pos]; + } else { + median_val[index * 2] = + pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + T median_val_left = pos > 0 ? 
sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T median_val_right = sort_out_ptr[pos]; + output[index] = (median_val_left + median_val_right) / div_factor; + } + } +} + +template +__global__ void CalcNanmedianKernel(const T* sort_out_ptr, + const int64_t* sort_indices_ptr, + int64_t* nan_counts, + int64_t* median_val, + T* output, + const bool is_odd, + const int64_t pre_dim, + const int64_t max_valid_num, + const int64_t stride, + const T div_factor, + const T nan_val) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t pos = static_cast(index * max_valid_num); + int64_t nan_cnt = nan_counts[index]; + if (nan_cnt == stride) { + median_val[index * 2] = -1; + median_val[index * 2 + 1] = -1; + output[index] = nan_val; + } else { + int64_t nan_k = + nan_cnt > 0 ? static_cast(stride - nan_cnt) : max_valid_num; + int64_t row_pos = static_cast(nan_k >> 1); + pos += row_pos; + + if (nan_k & 1) { + median_val[index * 2] = sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + output[index] = sort_out_ptr[pos]; + } else { + median_val[index * 2] = + pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + T median_val_left = pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T median_val_right = sort_out_ptr[pos]; + output[index] = (median_val_left + median_val_right) / div_factor; + } + } + } +} + +template +void ProcessMedianKernel(const Context& dev_ctx, + const DenseTensor& x, + bool ignore_nan, + DenseTensor* out, + int64_t* m_ptr) { + bool should_ignore_nan = ignore_nan; + auto stream = dev_ctx.stream(); + + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + int64_t i = 0; + + DenseTensor nan_counts, nan_stat; + int64_t* nan_counts_ptr; + int64_t max_valid_num = 0; + if (should_ignore_nan) { + nan_counts.Resize(phi::make_ddim({pre_dim})); + dev_ctx.template Alloc(&nan_counts); + nan_counts_ptr = nan_counts.data(); + nan_stat.Resize(phi::make_ddim({2})); + int64_t* nan_stat_mem = dev_ctx.template Alloc(&nan_stat); + int64_t* nan_stat_ptr = nan_stat.data(); + + KernelNanCounts<<>>(x_ptr, + numel, + pre_dim, + stride, + std::numeric_limits::min(), + nan_stat_ptr, + nan_counts_ptr); + + auto nan_stat_mem_cpu = + paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2); + int64_t* nan_stat_cpu_ptr = + reinterpret_cast(nan_stat_mem_cpu->ptr()); + paddle::memory::Copy(phi::CPUPlace(), + nan_stat_cpu_ptr, + dev_ctx.GetPlace(), + nan_stat_mem, + sizeof(int64_t) * 2, + stream); + + // all elements are nan values + T nan_val = std::numeric_limits::quiet_NaN(); + if (nan_stat_cpu_ptr[0] == numel) { + FullLikeKernel(dev_ctx, x, nan_val, x.dtype(), out); + return; + } + + should_ignore_nan = nan_stat_cpu_ptr[0] > 0; + max_valid_num = nan_stat_cpu_ptr[1]; + } + + int64_t sort_k = should_ignore_nan ? 
max_valid_num : ((stride >> 1) + 1); + bool is_ori_odd = stride & 1; + + DenseTensor sort_out, sort_indices; + auto sort_dim = x.dims(); + int64_t rank = sort_dim.size(); + sort_dim[rank - 1] = sort_k; + sort_out.Resize(sort_dim); + sort_indices.Resize(sort_dim); + + dev_ctx.template Alloc(&sort_out); + T* sort_out_ptr = sort_out.data(); + dev_ctx.template Alloc(&sort_indices); + int64_t* sort_indices_ptr = sort_indices.data(); + + TopkKernel( + dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); + + T div_factor = static_cast(2.0); + T nan_val = std::numeric_limits::quiet_NaN(); + if (should_ignore_nan) { + CalcNanmedianKernel< + T><<>>( + sort_out_ptr, + sort_indices_ptr, + nan_counts_ptr, + m_ptr, + o_ptr, + is_ori_odd, + pre_dim, + max_valid_num, + stride, + div_factor, + nan_val); + } else { + CalcMedianKernel< + T><<>>( + sort_out_ptr, + sort_indices_ptr, + m_ptr, + o_ptr, + div_factor, + is_ori_odd, + pre_dim, + sort_k); + } +} + +template +void BaseMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& axes, + bool ignore_nan, + DenseTensor* out, + DenseTensor* median_index) { + DenseTensor x; + auto rank = input.dims().size(); + if ((axes.size() == 0) || rank <= 1) { + x = input; + x.Resize({input.numel()}); + } else { + PreprocessMedianKernel(dev_ctx, input, axes, &x); + } + + int64_t* m_ptr = dev_ctx.template Alloc(median_index); + ProcessMedianKernel(dev_ctx, x, ignore_nan, out, m_ptr); + out->Resize(out->dims()); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keepdim, + DenseTensor* out, + DenseTensor* median_index) { + BaseMedianKernel(dev_ctx, x, axes, true, out, median_index); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian, + GPU, + ALL_LAYOUT, + phi::NanmedianKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu new file mode 100644 index 0000000000000..44dc31ed5d926 --- /dev/null +++ b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/rrelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +__global__ void RReluOpGradKernel(const T* x_ptr, + const T* noise_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + int numel) { + CUDA_KERNEL_LOOP(index, numel) { + T scale = noise_ptr[index]; + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + x_grad_ptr[index] = (x < zero) ? 
scale * out_grad : out_grad; + } +} + +template +class RReluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* noise, + const T* out_grad, + T* x_grad, + int numel) { + RReluOpGradKernel< + T><<>>( + x, noise, out_grad, x_grad, numel); + } +}; + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + if (!x_grad) return; + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* n_ptr = noise.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + + int numel = x.numel(); + auto stream = dev_ctx.stream(); + + RReluOpGradFunctor rrelu_grad; + rrelu_grad(stream, x_ptr, n_ptr, out_grad_ptr, x_grad_ptr, numel); +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu_grad, + GPU, + ALL_LAYOUT, + phi::RReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/rrelu_kernel.cu b/paddle/phi/kernels/gpu/rrelu_kernel.cu new file mode 100644 index 0000000000000..39582d5872a70 --- /dev/null +++ b/paddle/phi/kernels/gpu/rrelu_kernel.cu @@ -0,0 +1,112 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
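// A standalone sketch of the forward/backward contract shared by the rrelu
// kernels in this patch, with std::vector standing in for DenseTensor and
// std::mt19937 standing in for Paddle's CPU generator; the helper names
// RReluForward/RReluBackward are illustrative. Negative inputs are scaled by a
// per-element factor drawn from U(lower, upper) in training (the factor is
// saved in `noise`), by the fixed midpoint (lower + upper) / 2 at test time,
// and the backward pass reuses the saved factor.
#include <cstddef>
#include <random>
#include <vector>

void RReluForward(const std::vector<float>& x, float lower, float upper,
                  bool is_test, std::vector<float>* out,
                  std::vector<float>* noise) {
  std::mt19937 engine(0);
  std::uniform_real_distribution<float> dist(lower, upper);
  out->resize(x.size());
  noise->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    const float scale =
        x[i] < 0.f ? (is_test ? (lower + upper) / 2.f : dist(engine)) : 1.f;
    (*noise)[i] = scale;          // saved for the backward pass
    (*out)[i] = scale * x[i];     // scale is 1 for non-negative inputs
  }
}

void RReluBackward(const std::vector<float>& x, const std::vector<float>& noise,
                   const std::vector<float>& out_grad,
                   std::vector<float>* x_grad) {
  x_grad->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    (*x_grad)[i] = x[i] > 0.f ? out_grad[i] : noise[i] * out_grad[i];
}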
*/ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/rrelu_kernel.h" + +namespace phi { + +template +struct RReluTrainCudaFunctor { + public: + RReluTrainCudaFunctor(const T* in, T* out, T* noise) + : in_(in), out_(out), noise_(noise) { + zero_ = static_cast(0); + } + + __device__ void operator()(int64_t idx) { + T x = in_[idx]; + if (x < zero_) { + out_[idx] = noise_[idx] * x; + } else { + out_[idx] = x; + noise_[idx] = 1.0; + } + } + + private: + const T* in_; + T* out_; + T* noise_; + T zero_; +}; + +template +struct RReluTestCudaFunctor { + public: + RReluTestCudaFunctor(const T* in, T* out, T* noise, T mid_val) + : in_(in), out_(out), noise_(noise), mid_val_(mid_val) { + zero_ = static_cast(0); + } + + __device__ void operator()(int64_t idx) { + T x = in_[idx]; + if (x < zero_) { + out_[idx] = mid_val_ * x; + noise_[idx] = mid_val_; + } else { + out_[idx] = x; + noise_[idx] = 1.0; + } + } + + private: + const T* in_; + T* out_; + T* noise_; + T zero_; + T mid_val_; +}; + +template +void RReluKernel(const Context& ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise) { + const T* x_data = x.data(); + T* out_data = ctx.template Alloc(out); + T* noise_data = ctx.template Alloc(noise); + auto size = x.numel(); + if (size <= 0) return; + + phi::funcs::ForRange for_range(ctx, size); + if (is_test) { + T mid_val = static_cast((lower + upper) / 2.0); + RReluTestCudaFunctor functor(x_data, out_data, noise_data, mid_val); + for_range(functor); + } else { + using MT = typename kps::details::MPTypeTrait::Type; + funcs::uniform_distribution dist; + funcs::uniform_real_transform trans(lower, upper); + funcs::distribution_and_transform(ctx, noise, dist, trans); + RReluTrainCudaFunctor functor(x_data, out_data, noise_data); + for_range(functor); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu, + GPU, + ALL_LAYOUT, + phi::RReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index 0c3c29e82c42a..990877a8445cb 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(tile, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index b396e8fa6b0eb..53e4c39d8bcee 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -711,11 +711,11 @@ void DepthwiseConvCudnnGradGradKernel( template void Conv3DCudnnGradGradKernel( const Context& ctx, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -725,9 +725,9 @@ void Conv3DCudnnGradGradKernel( bool use_addto, int workspace_size_MB, bool exhaustive_search_t, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + 
DenseTensor* out_grad_grad) { ConvCudnnGradGradKernel(ctx, input, filter, diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 77159bfc876da..58781e8c6e491 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -888,19 +888,6 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx, #endif } -template -static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) { - if (dev_ctx.cudnn_handle() != nullptr) { - if (std::is_same::value) { -#if CUDNN_VERSION < 8100 - return false; -#endif - } - return true; - } - return false; -} - #if CUDNN_VERSION < 8100 template <> inline void SoftmaxForwardCudnnKernel( @@ -927,6 +914,25 @@ inline void SoftmaxBackwardCudnnKernel( } #endif +template +bool UseCudnnSoftmax(const GPUContext& ctx, int softmax_dim, bool last_dim) { + bool cudnn_available = ctx.cudnn_handle(); + if (!ctx.cudnn_handle()) { + if (std::is_same::value) { +#if CUDNN_VERSION < 8100 + cudnn_available = false; +#endif + } + } + constexpr int max_dim = 512; + if (!cudnn_available || !last_dim || + (softmax_dim <= max_dim && sizeof(T) <= 4)) { + return false; + } else { + return true; + } +} + template void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, const DenseTensor& x, @@ -941,10 +947,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, int dim = tensor_dims[1]; int D = tensor_dims[2]; - constexpr int max_dim = 512; - - if (D == 1 && - (!CanUseCudnnSoftmax(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) { + if (D == 1 && !UseCudnnSoftmax(dev_ctx, dim, true)) { int dim_log2 = static_cast(Log2Ceil(dim)); int dim_ceil = 1 << dim_log2; int warp_size = (dim_ceil < 32) ? dim_ceil : 32; @@ -1016,10 +1019,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx, int dim = tensor_dims[1]; int D = tensor_dims[2]; - constexpr int max_dim = 512; - - if (D == 1 && - (!CanUseCudnnSoftmax(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) { + if (D == 1 && !UseCudnnSoftmax(dev_ctx, dim, true)) { int dim_log2 = Log2Ceil(dim); int dim_ceil = 1 << dim_log2; int warp_size = (dim_ceil < 32) ? dim_ceil : 32; diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index aceb97a49b1c2..a72db326807f8 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -197,20 +197,24 @@ void EinsumGradKernel(const Context& dev_ctx, // release the cache tensor dTC to save memory right now. they are useless // now. cache.clear(); - *(x_grad[0]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[0], - ops[0], - dA); - *(x_grad[1]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[1], - ops[1], - dB); + if (x_grad[0]) { + *(x_grad[0]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[0], + ops[0], + dA); + } + if (x_grad[1]) { + *(x_grad[1]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[1], + ops[1], + dB); + } } } } // namespace phi diff --git a/paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h b/paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h new file mode 100644 index 0000000000000..f57434127620c --- /dev/null +++ b/paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/nanmedian_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PostprocessMedianGradKernel(const Context& dev_ctx, + DenseTensor* input, + const IntArray& raw_axes, + DenseTensor* x) { + auto input_dim = input->dims(); + auto rank = input_dim.size(); + + std::vector axes = raw_axes.GetData(); + int64_t axes_size = static_cast(axes.size()); + for (int64_t i = 0; i < axes_size; i++) { + if (axes[i] < 0) { + axes[i] += rank; + } + } + + std::vector trans_back; + std::vector reshape_back; + trans_back.reserve(rank); + trans_back.resize(rank); + + int offset = 0; + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) == axes.end()) { + reshape_back.push_back(input_dim[i]); + trans_back[i] = offset; + offset += 1; + } + } + + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) != axes.end()) { + trans_back[i] = offset; + reshape_back.push_back(input_dim[i]); + offset += 1; + } + } + + input->Resize(make_ddim(reshape_back)); + funcs::TransCompute( + static_cast(trans_back.size()), dev_ctx, *input, x, trans_back); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/nanmedian_kernel_impl.h b/paddle/phi/kernels/impl/nanmedian_kernel_impl.h new file mode 100644 index 0000000000000..57e9e5646e559 --- /dev/null +++ b/paddle/phi/kernels/impl/nanmedian_kernel_impl.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
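// A standalone sketch of the axis bookkeeping done by PreprocessMedianKernel
// below and undone by PostprocessMedianGradKernel above: kept dimensions come
// first, reduced dimensions are moved to the back and flattened into a single
// trailing dimension. For example, dims = {2, 3, 4, 5} with axes = {1, 3}
// yields perm = {0, 2, 1, 3}, a transposed shape of {2, 4, 3, 5}, and a
// reshaped view of {2, 4, 15}. std::vector stands in for DDim/IntArray and the
// helper name MedianPermAndShape is illustrative.
#include <algorithm>
#include <cstdint>
#include <vector>

void MedianPermAndShape(const std::vector<int64_t>& dims,
                        std::vector<int64_t> axes,
                        std::vector<int>* perm,
                        std::vector<int64_t>* reshape) {
  const int64_t rank = static_cast<int64_t>(dims.size());
  for (auto& a : axes)
    if (a < 0) a += rank;                      // normalize negative axes
  int64_t post_numel = 1;
  for (int64_t i = 0; i < rank; ++i)           // kept dimensions first
    if (std::find(axes.begin(), axes.end(), i) == axes.end()) {
      perm->push_back(static_cast<int>(i));
      reshape->push_back(dims[i]);
    }
  for (int64_t i = 0; i < rank; ++i)           // reduced dimensions last
    if (std::find(axes.begin(), axes.end(), i) != axes.end()) {
      perm->push_back(static_cast<int>(i));
      post_numel *= dims[i];
    }
  reshape->push_back(post_numel);              // flatten reduced dims
}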
+ +#pragma once + +#include "paddle/phi/kernels/nanmedian_kernel.h" + +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PreprocessMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& raw_axes, + DenseTensor* x) { + auto input_dim = input.dims(); + auto rank = input_dim.size(); + std::vector perm; + std::vector reshape; + + std::vector axes = raw_axes.GetData(); + int64_t axes_size = static_cast(axes.size()); + for (int64_t i = 0; i < axes_size; i++) { + if (axes[i] < 0) { + axes[i] += rank; + } + } + + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) == axes.end()) { + perm.push_back(i); + reshape.push_back(input_dim[i]); + } + } + + int64_t post_numel = 1; + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) != axes.end()) { + perm.push_back(i); + post_numel *= input_dim[i]; + } + } + reshape.push_back(post_numel); + + DDim trans_dim(input_dim); + int ndims = perm.size(); + for (int i = 0; i < ndims; i++) { + trans_dim[i] = input_dim[perm[i]]; + } + x->Resize(trans_dim); + dev_ctx.template Alloc(x); + funcs::TransCompute(ndims, dev_ctx, input, x, perm); + + x->Resize(make_ddim(reshape)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h index be7e4ce3e3488..2a661a3fd3853 100644 --- a/paddle/phi/kernels/instance_norm_grad_kernel.h +++ b/paddle/phi/kernels/instance_norm_grad_kernel.h @@ -21,10 +21,10 @@ namespace phi { template void InstanceNormGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& y_grad, const paddle::optional& scale, const DenseTensor& saved_mean, const DenseTensor& saved_variance, + const DenseTensor& y_grad, float epsilon, DenseTensor* x_grad, DenseTensor* scale_grad, diff --git a/paddle/phi/kernels/nanmedian_grad_kernel.h b/paddle/phi/kernels/nanmedian_grad_kernel.h new file mode 100644 index 0000000000000..e8fb01b7060a7 --- /dev/null +++ b/paddle/phi/kernels/nanmedian_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/nanmedian_kernel.h b/paddle/phi/kernels/nanmedian_kernel.h new file mode 100644 index 0000000000000..4bb382a443144 --- /dev/null +++ b/paddle/phi/kernels/nanmedian_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/int_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void NanmedianKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const IntArray& axes,
+                     bool keep_dim,
+                     DenseTensor* out,
+                     DenseTensor* medians);
+}  // namespace phi
diff --git a/paddle/phi/kernels/rrelu_grad_kernel.h b/paddle/phi/kernels/rrelu_grad_kernel.h
new file mode 100644
index 0000000000000..b6172fca10e53
--- /dev/null
+++ b/paddle/phi/kernels/rrelu_grad_kernel.h
@@ -0,0 +1,28 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RReluGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& noise,
+                     const DenseTensor& out_grad,
+                     DenseTensor* x_grad);
+}  // namespace phi
diff --git a/paddle/phi/kernels/rrelu_kernel.h b/paddle/phi/kernels/rrelu_kernel.h
new file mode 100644
index 0000000000000..8deb52daaae13
--- /dev/null
+++ b/paddle/phi/kernels/rrelu_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
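+
+// RReluKernel declared below implements randomized leaky ReLU: during
+// training the negative slope is sampled per element from the uniform range
+// [lower, upper], and `noise` stores the sampled slopes so that
+// RReluGradKernel can reuse them in the backward pass. When is_test is true
+// the kernel is expected to behave deterministically (typically using the
+// mean slope (lower + upper) / 2, following the RReLU formulation); the
+// exact rule lives in the device-specific kernel implementations rather
+// than this header.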
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise); +} // namespace phi diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc index c6aae1bf5bb54..49f31288d00f6 100644 --- a/paddle/phi/ops/compat/conv3d_sig.cc +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -49,7 +49,7 @@ KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Conv3dDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("conv3d_grad_grad", - {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, {"strides", "paddings", "padding_algorithm", @@ -59,7 +59,7 @@ KernelSignature Conv3dDoubleGradOpArgumentMapping( "use_addto", "workspace_size_MB", "exhaustive_search"}, - {"DDOutput", "DInput", "DFilter"}); + {"DInput", "DFilter", "DDOutput"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc index 2b490078512b1..6ccf120979887 100644 --- a/paddle/phi/ops/compat/instance_norm_sig.cc +++ b/paddle/phi/ops/compat/instance_norm_sig.cc @@ -27,7 +27,7 @@ KernelSignature InstanceNormOpArgumentMapping( KernelSignature InstanceNormGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("instance_norm_grad", - {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"}, + {"X", "Scale", "SavedMean", "SavedVariance", "Y@GRAD"}, {"epsilon"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); } diff --git a/paddle/phi/ops/compat/nanmedian_sig.cc b/paddle/phi/ops/compat/nanmedian_sig.cc new file mode 100644 index 0000000000000..5ca0d450e3b41 --- /dev/null +++ b/paddle/phi/ops/compat/nanmedian_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature NanmedianOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "nanmedian", {"X"}, {"axis", "keepdim"}, {"Out", "MedianIndex"}); +} + +KernelSignature NanmedianGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("nanmedian_grad", + {"X", "MedianIndex", "Out@GRAD"}, + {"axis", "keepdim"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(nanmedian, phi::NanmedianOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nanmedian_grad, phi::NanmedianGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/rrelu_sig.cc b/paddle/phi/ops/compat/rrelu_sig.cc new file mode 100644 index 0000000000000..00cd705a24076 --- /dev/null +++ b/paddle/phi/ops/compat/rrelu_sig.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "rrelu", {"X"}, {"lower", "upper", "is_test"}, {"Out", "Noise"}); +} + +KernelSignature RReluGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "rrelu_grad", {"X", "Noise", "Out@GRAD"}, {}, {"X@GRAD"}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(rrelu, phi::RReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rrelu_grad, phi::RReluGradGradOpArgumentMapping); diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index 5c1d0989629dc..2333f82d626c4 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -15,6 +15,7 @@ cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_embedding_api SRCS test_embedding_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/api/test_embedding_api.cc b/paddle/phi/tests/api/test_embedding_api.cc new file mode 100644 index 0000000000000..6ccd382786bd1 --- /dev/null +++ b/paddle/phi/tests/api/test_embedding_api.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(sparse_weight_embedding, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_weight_embedding_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_weight_embedding_sparse_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(empty, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + +namespace paddle { +namespace tests { + +TEST(API, sparse_weight_embedding) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out = paddle::experimental::embedding(x, weight); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 4); + ASSERT_EQ(out.numel(), 12); + ASSERT_EQ(out.type(), phi::DataType::FLOAT32); + ASSERT_EQ(out.layout(), phi::DataLayout::NCHW); +} + +TEST(API, sparse_weight_embedding_grad) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out_grad = paddle::experimental::full({4, 3}, 1, DataType::FLOAT32); + + paddle::experimental::Tensor weight_grad; + + paddle::experimental::embedding_grad( + x, weight, out_grad, -1, false, &weight_grad); + + // 3. check result + ASSERT_EQ(weight_grad.dims().size(), 2); + ASSERT_EQ(weight_grad.dims()[0], 16); + ASSERT_EQ(weight_grad.numel(), 48); + ASSERT_EQ(weight_grad.type(), phi::DataType::FLOAT32); + ASSERT_EQ(weight_grad.layout(), phi::DataLayout::NCHW); +} + +TEST(API, sparse_weight_embedding_sparse_grad) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out_grad = paddle::experimental::full({4, 3}, 1, DataType::FLOAT32); + + paddle::experimental::Tensor weight_grad; + + paddle::experimental::embedding_grad( + x, weight, out_grad, -1, true, &weight_grad); + + // 3. 
check result + ASSERT_EQ(weight_grad.dims().size(), 2); + ASSERT_EQ(weight_grad.dims()[0], 4); + ASSERT_EQ(weight_grad.numel(), 12); + ASSERT_EQ(weight_grad.type(), phi::DataType::FLOAT32); + ASSERT_EQ(weight_grad.layout(), phi::DataLayout::NCHW); +} + +} // namespace tests +} // namespace paddle diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 0f70f9a8f3564..2a18d2f7e0195 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -85,6 +85,9 @@ if not defined NEW_RELEASE_JIT set NEW_RELEASE_JIT=OFF set task_name=%1 set UPLOAD_TP_FILE=OFF +set error_code=0 +type %cache_dir%\error_code.txt + rem ------initialize set git config------ git config --global core.longpaths true @@ -118,8 +121,6 @@ if "%WITH_CACHE%"=="OFF" ( goto :mkbuild ) -set error_code=0 -type %cache_dir%\error_code.txt : set /p error_code=< %cache_dir%\error_code.txt if %error_code% NEQ 0 ( rmdir %BUILD_DIR% /s/q diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index efd2de5621604..b96283636f8fd 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -360,6 +360,10 @@ function check_style() { # pre-commit use python3.8.0 OLD_PATH=$PATH export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH} + + if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then + pip install pre-commit==2.17.0 + fi pre-commit install clang-format --version @@ -1563,6 +1567,10 @@ set +x card_test "$exclusive_tests_medium_parallel" -1 4 # run cases exclusively, in this cases would be run with 2/4/8 GPUs card_test "$exclusive_tests_non_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs exclu_ut_endTime_s=`date +%s` + + echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" + echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multi_ut_endTime_s - $multi_ut_startTime_s ]s" + echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $exclu_ut_endTime_s - $exclu_ut_startTime_s ]s" echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multi_ut_endTime_s - $multi_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt @@ -1684,6 +1692,68 @@ set -ex fi } +function classify_case_by_cardNum() { + cd ${PADDLE_ROOT}/build + test_cases=$(ctest -N -V) # get all test cases + single_card_tests='^job$' # all cases list which would take single GPU + multiple_card_tests='^job$' + exclusive_card_tests='^job$' + nightly_tests='^job$' + + is_exclusive='' # indicate whether the case is exclusive type + is_multicard='' # indicate whether the case is multiple GPUs type + is_nightly='' # indicate whether the case will only run at night +set +x + while read -r line; do + if [[ "$line" == "" ]]; then + continue + fi + read matchstr <<< $(echo "$line"|grep -oEi 'Test[ \t]+#') + if [[ "$matchstr" == "" ]]; then + # Any test case with LABELS property would be parse here + # RUN_TYPE=EXCLUSIVE mean the case would run exclusively + # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime + # RUN_TYPE=NIGHTLY or RUN_TYPE=DIST:NIGHTLY or RUN_TYPE=EXCLUSIVE:NIGHTLY means the case will ONLY run at night + read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE") + read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST") + read is_nightly <<< $(echo "$line"|grep -oEi 
"RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY") + continue + fi + read testcase <<< $(echo "$line"|grep -oEi "\w+$") + + if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then + echo $testcase" will only run at night." + nightly_tests="$nightly_tests|^$testcase$" + echo "$testcase" >> ${PADDLE_ROOT}/build/nightly_case + continue + fi + + if [[ "$is_multicard" == "" ]]; then + # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs + read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") + fi + if [[ "$is_exclusive" != "" ]]; then + exclusive_card_tests="$exclusive_card_tests|^$testcase$" + elif [[ "$is_multicard" != "" ]]; then + multiple_card_tests="$multiple_card_tests|^$testcase$" + else + single_card_tests="$single_card_tests|^$testcase$" + fi + is_exclusive='' + is_multicard='' + is_nightly='' + matchstr='' + testcase='' + done <<< "$test_cases"; +set -x + rm -rf ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt + touch ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt + echo 'single_card_tests: '$single_card_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt + echo 'multiple_card_tests: '$multiple_card_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt + echo 'exclusive_card_tests: '$exclusive_card_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt + echo 'nightly_tests: '$nightly_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt +} + function show_ut_retry_result() { if [ "$SYSTEM" == "Darwin" ]; then exec_retry_threshold_count=10 @@ -1921,8 +1991,15 @@ set -x #generate ut file map python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT} + + wait; + #classify_case_by_cardNum + classify_case_by_cardNum + #generate ut mem map - python ${PADDLE_ROOT}/tools/get_ut_mem_map.py $tmp_dir + python ${PADDLE_ROOT}/tools/get_ut_mem_map.py $tmp_dir + python ${PADDLE_ROOT}/tools/final_ut_parallel_rule.py ${PADDLE_ROOT} + } function get_failedUts_precise_map_file { @@ -2288,6 +2365,220 @@ set -ex fi } +function parallel_test_base_gpu_test() { + if [ ${WITH_TESTING:-ON} == "ON" ] ; then + cat < all_ut_list + get_quickly_disable_ut||disable_ut_quickly='disable_ut' # indicate whether the case was in quickly disable list + test_cases=$(ctest -N -V) # get all test cases + + python ${PADDLE_ROOT}/tools/group_case_for_parallel.py ${PADDLE_ROOT} + + single_ut_mem_0_startTime_s=`date +%s` + while read line + do + card_test "$line" 1 4 + done < $PADDLE_ROOT/tools/single_card_tests_mem0_new + single_ut_mem_0_endTime_s=`date +%s` + echo "ipipe_log_param_1_mem_0_TestCases_Total_Time: $[ $single_ut_mem_0_endTime_s - $single_ut_mem_0_startTime_s ]s" + echo "ipipe_log_param_1_mem_0_TestCases_Total_Time: $[ $single_ut_mem_0_endTime_s - $single_ut_mem_0_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + + single_ut_startTime_s=`date +%s` + while read line + do + num=$[(`echo $line | awk -F"$" '{print NF-1}'`-1)/6] + if [ $num -eq 0 ]; then + num=1 + fi + card_test "$line" 1 $num + done < $PADDLE_ROOT/tools/single_card_tests_new + single_ut_endTime_s=`date +%s` + echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" + echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + + multiple_ut_mem_0_startTime_s=`date +%s` + while read line + do + card_test "$line" 2 4 + done < $PADDLE_ROOT/tools/multiple_card_tests_mem0_new + 
multiple_ut_mem_0_endTime_s=`date +%s` + echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $[ $multiple_ut_mem_0_endTime_s - $multiple_ut_mem_0_startTime_s ]s" + echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $[ $multiple_ut_mem_0_endTime_s - $multiple_ut_mem_0_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + multiple_ut_startTime_s=`date +%s` + while read line + do + num=$[(`echo $line | awk -F"$" '{print NF-1}'`-1)/6] + if [ $num -eq 0 ]; then + num=1 + fi + card_test "$line" 2 $num + + done < $PADDLE_ROOT/tools/multiple_card_tests_new + multiple_ut_endTime_s=`date +%s` + echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multiple_ut_endTime_s - $multiple_ut_startTime_s ]s" + echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multiple_ut_endTime_s - $multiple_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + + exclusive_ut_mem_0_startTime_s=`date +%s` + while read line + do + card_test "$line" -1 4 + done < $PADDLE_ROOT/tools/exclusive_card_tests_mem0_new + exclusive_ut_mem_0_endTime_s=`date +%s` + echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $[ $exclusive_ut_mem_0_endTime_s - $exclusive_ut_mem_0_startTime_s ]s" + echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $[ $exclusive_ut_mem_0_endTime_s - $exclusive_ut_mem_0_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + + exclusive_ut_startTime_s=`date +%s` + while read line + do + num=$[(`echo $line | awk -F"$" '{print NF-1}'`-1)/6] + if [ $num -eq 0 ]; then + num=1 + fi + card_test "$line" -1 $num + done < $PADDLE_ROOT/tools/exclusive_card_tests_new + exclusive_ut_endTime_s=`date +%s` + echo "ipipe_log_param_-1_TestCases_Total_Time: $[ $exclusive_ut_endTime_s - $exclusive_ut_startTime_s ]s" + echo "ipipe_log_param_-1_TestCases_Total_Time: $[ $exclusive_ut_endTime_s - $exclusive_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + + noparallel_ut_startTime_s=`date +%s` + while read line + do + card_test "$line" -1 2 + done < $PADDLE_ROOT/tools/no_parallel_case_file + noparallel_ut_endTime_s=`date +%s` + echo "ipipe_log_param_noparallel_TestCases_Total_Time: $[ $noparallel_ut_endTime_s - $noparallel_ut_startTime_s ]s" + echo "ipipe_log_param_noparallel_TestCases_Total_Time: $[ $noparallel_ut_endTime_s - $noparallel_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + ###retry + collect_failed_tests + rm -f $tmp_dir/* + exec_times=0 + retry_unittests_record='' + retry_time=4 + exec_time_array=('first' 'second' 'third' 'fourth') + parallel_failed_tests_exec_retry_threshold=120 + exec_retry_threshold=30 + is_retry_execuate=0 + rerun_ut_startTime_s=`date +%s` + if [ -n "$failed_test_lists" ];then + if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then + bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for tiemout uts which killed by ctest + fi + read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_arr=(${need_retry_ut_str}) + need_retry_ut_count=${#need_retry_ut_arr[@]} + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + while ( [ $exec_times -lt $retry_time ] ) + do + if [[ "${exec_times}" == "0" ]] ;then + if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then + is_retry_execuate=0 + else + is_retry_execuate=1 + fi + elif [[ "${exec_times}" == "1" ]] ;then + read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + 
need_retry_ut_arr=(${need_retry_ut_str}) + need_retry_ut_count=${#need_retry_ut_arr[@]} + if [ $need_retry_ut_count -lt $exec_retry_threshold ];then + is_retry_execuate=0 + else + is_retry_execuate=1 + fi + fi + if [[ "$is_retry_execuate" == "0" ]];then + set +e + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` + set -e + if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "2" ]];then + if [[ "${failed_test_lists}" == "" ]];then + break + else + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + fi + fi + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${retry_unittests}" + for line in ${retry_unittests[@]} ; + do + if [[ "$retry_cases" == "" ]]; then + retry_cases="^$line$" + else + retry_cases="$retry_cases|^$line$" + fi + done + + if [[ "$retry_cases" != "" ]]; then + card_test "$retry_cases" -1 2 + fi + exec_times=$[$exec_times+1] + failed_test_lists='' + collect_failed_tests + rm -f $tmp_dir/* + retry_cases='' + else + break + fi + done + retry_unittests_record="$retry_unittests_record$failed_test_lists" + fi + rerun_ut_endTime_s=`date +%s` + echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" + echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/ + if [[ "$EXIT_CODE" != "0" ]]; then + show_ut_retry_result + fi +set -ex + fi +} + function parallel_test_base_ipu() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu @@ -2424,7 +2715,7 @@ function parallel_test() { if [ "$WITH_CINN" == "ON" ];then parallel_test_base_cinn elif [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then - parallel_test_base_gpu + parallel_test_base_gpu_test elif [ "$WITH_XPU" == "ON" ];then parallel_test_base_xpu elif [ "$WITH_ASCEND_CL" == "ON" ];then @@ -3127,7 +3418,6 @@ function main() { parallel_test ;; cicheck_coverage) - check_approvals_of_unittest 1 check_diff_file_for_coverage cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} enable_unused_var_check @@ -3136,13 +3426,11 @@ function main() { check_change_of_unittest ${PYTHON_ABI:-""} ;; cpu_cicheck_coverage) - check_approvals_of_unittest 1 check_diff_file_for_coverage cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} enable_unused_var_check ;; gpu_cicheck_coverage) - check_approvals_of_unittest 1 parallel_test check_coverage check_change_of_unittest ${PYTHON_ABI:-""} @@ -3294,6 +3582,10 @@ function main() { # only test trt convert. 
trt_convert_test ;; + classify_case_by_cardNum) + # only class case by card num + classify_case_by_cardNum + ;; *) print_usage exit 1 diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 132105fb2b689..194c2e8cce4fc 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -75,7 +75,6 @@ import paddle.reader # noqa: F401 import paddle.static # noqa: F401 import paddle.vision # noqa: F401 -import paddle.sparse # noqa: F401 from .tensor.attribute import is_complex # noqa: F401 from .tensor.attribute import is_integer # noqa: F401 @@ -331,6 +330,7 @@ from .tensor.stat import var # noqa: F401 from .tensor.stat import numel # noqa: F401 from .tensor.stat import median # noqa: F401 +from .tensor.stat import nanmedian # noqa: F401 from .tensor.stat import quantile # noqa: F401 from .tensor.stat import nanquantile # noqa: F401 from .device import get_cudnn_version # noqa: F401 @@ -498,6 +498,7 @@ 'load', 'numel', 'median', + 'nanmedian', 'quantile', 'nanquantile', 'no_grad', diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 31bdc4cc650af..465c450c0b076 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -20,7 +20,7 @@ from paddle.fluid import framework from .utils import print_program_with_dist_attr -from .operators import find_best_compatible_distributed_operator_impl +from .operators import find_compatible_distributed_operator_impls from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator @@ -238,13 +238,17 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name()) compatible_dims_mapping = compute_compatible_dims_mapping( [op_dims_mapping, tensor_dims_mapping]) + if not _validate_dims_mapping( + compatible_dims_mapping, + op_dist_attr.process_mesh): + continue if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != op_dims_mapping): op_dist_attr.set_input_dims_mapping( tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=True) if op_dist_impls is not None: not_compatible = True @@ -254,7 +258,8 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): dim_changed = op_dist_impl.update_dims_mapping(dist_op) if dim_changed: changed = True - if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.is_auto_compatible(dist_op) \ + and dist_op.validate_dist_attr(): if op_dist_impl.type == "elementwise": op_dist_attr.impl_type = "default" else: @@ -289,13 +294,17 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name()) compatible_dims_mapping = compute_compatible_dims_mapping( [op_dims_mapping, tensor_dims_mapping]) + if not _validate_dims_mapping( + compatible_dims_mapping, + op_dist_attr.process_mesh): + continue if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != op_dims_mapping): op_dist_attr.set_output_dims_mapping( tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=False) if 
op_dist_impls is not None: not_compatible = True @@ -305,8 +314,8 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): dim_changed = op_dist_impl.update_dims_mapping(dist_op) if dim_changed: changed = True - if op_dist_impl.is_auto_compatible(dist_op): - not_compatible = False + if op_dist_impl.is_auto_compatible(dist_op) \ + and dist_op.validate_dist_attr(): if op_dist_impl.type == "elementwise": op_dist_attr.impl_type = "default" else: @@ -352,6 +361,23 @@ def _update_dims_mapping_between_graphs(self): changed = True return changed + def _update_dims_mapping_for_special(self): + # Set the dims_mapping of a tensor to the dims_mapping inside the op which produces it + op_nodes = self._dist_context._serial_ordered_op_nodes + for op_node in op_nodes: + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + for tensor_node in op_node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER: + continue + tensor_desc = tensor_node.var() + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh: + op_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_desc.name()) + tensor_dist_attr.dims_mapping = op_dims_mapping + def _update_dims_mapping(self): # Complete dims_mapping for each node reach_fix_point = False @@ -378,6 +404,7 @@ def _update_dims_mapping(self): reach_fix_point = False else: reach_fix_point = True + self._update_dims_mapping_for_special() def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) @@ -685,7 +712,7 @@ def _update_process_mesh(self): # Step 3: adjust the process meshes for special ops self._update_process_mesh_for_specials() - # Step 4: adjust the process meshes between graphs + # Step 4: adjust the process meshes between graphs self._update_process_mesh_between_graphs() def _prepare(self): @@ -727,14 +754,14 @@ def complete_forward_annotation(self, serial_main_program=None): """ Complete annotation for the partial annotated serial_main_program. Arguments: serial_main_program: partial annotated serial_main_program. - Returns: + Returns:e serial_main_program: completed annotated serial_main_program. """ if serial_main_program is None: serial_main_program = self._dist_context.serial_main_program else: - self._dist_context.serial_main_program = serial_main_program + self._dist_context._serial_main_program = serial_main_program self._dist_context.initialize() @@ -757,13 +784,18 @@ def complete_forward_annotation(self, serial_main_program=None): return serial_main_program - def _complete_high_order_grad_annotation(self, serial_main_program): + def _complete_high_order_grad_annotation(self, serial_main_program=None): """ NOTE: [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient. This function is temporary to support high order gradient, and will be removed in the future. 
""" + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context._serial_main_program = serial_main_program + def _is_grad_var_name(name): if "@GRAD" in name: return True @@ -771,7 +803,7 @@ def _is_grad_var_name(name): def _get_op_by_id(ops, id): for op in ops: - if op.desc.id() == id: + if op.desc.original_id() == id: return op return None @@ -796,10 +828,12 @@ def _get_op_by_id(ops, id): # complete the annotation of grad op (xxx_grad op or sum op) # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + if grad_op.desc.original_id( + ) in dist_op_context.grad_op_id_to_op_id: # TODO support the case where one forward op corresponding to multiple xxx_grad op - forward_op = _get_op_by_id( - ops, dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + forward_op = _get_op_by_id(ops, + dist_op_context.grad_op_id_to_op_id[ + grad_op.desc.original_id()]) assert forward_op is not None fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( @@ -915,12 +949,13 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - def complete_backward_annotation(self, serial_main_program): + def complete_backward_annotation(self, serial_main_program=None): """Complete the annotation of vars and ops in the backward phase for parallel program.""" + if serial_main_program is None: serial_main_program = self._dist_context.serial_main_program else: - self._dist_context.serial_main_program = serial_main_program + self._dist_context._serial_main_program = serial_main_program def _is_grad_var_name(name): if "@GRAD" in name: @@ -935,7 +970,7 @@ def _get_forward_varname_from_grad_varname(grad_var_name): def _get_op_by_id(ops, id): for op in ops: - if op.desc.id() == id: + if op.desc.original_id() == id: return op return None @@ -997,11 +1032,12 @@ def _get_op_by_id(ops, id): # complete the annotation of grad op (xxx_grad op or sum op) # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + if grad_op.desc.original_id( + ) in dist_op_context.grad_op_id_to_op_id: # TODO support the case where one forward op corresponding to multiple xxx_grad op - forward_op = _get_op_by_id( - ops[:first_backward_op_idx], - dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + forward_op = _get_op_by_id(ops[:first_backward_op_idx], + dist_op_context.grad_op_id_to_op_id[ + grad_op.desc.original_id()]) assert forward_op is not None if grad_op.type == "concat" and forward_op.type == "split": @@ -1029,6 +1065,9 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.process_mesh = ref_mesh self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) + grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type + grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx + continue fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( @@ -1075,6 +1114,8 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.set_output_dims_mapping(output_name, ref_dims_mapping) + grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type + grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) @@ -1108,6 +1149,8 @@ def _get_op_by_id(ops, id): var_name, ref_fwd_dims_mapping) grad_op_dist_attr.set_output_dims_mapping( output_name, 
ref_fwd_dims_mapping) + grad_op_dist_attr.impl_type = "default" + grad_op_dist_attr.impl_idx = 0 elif grad_op.type == 'fill_zeros_like': ref_var_name = grad_op.input_arg_names[0] @@ -1139,12 +1182,13 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - def complete_update_annotation(self, serial_main_program=None): + def complete_update_annotation(self, serial_main_program): """Complete the annotation of vars and ops in the update phase for parallel program.""" - if serial_main_program is None: - serial_main_program = self._dist_context.serial_main_program - else: - self._dist_context.serial_main_program = serial_main_program + + # Notice: serial_main_program is actually a dist_main_program of current rank, + # and must be passed into this function. + # TODO: We should fix this behavior. + ops = list(serial_main_program.global_block().ops) vars = serial_main_program.global_block().vars learning_rate_completed = False @@ -1301,7 +1345,7 @@ def _init_global_mesh_for_program(self): dist_op.dist_attr.process_mesh = world_ranks # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=True) if op_dist_impls is not None: backup_op_dist_attr = copy.deepcopy(dist_op.dist_attr) diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index 28d2e2d5a3088..8958c4bf905c2 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -23,7 +23,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(AssignOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -41,7 +41,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(AssignValueOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -59,7 +59,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(BeamSearchOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -77,7 +77,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(BeamSearchDecodeOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -95,7 +95,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(CastOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete 
COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -113,7 +113,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ConcatOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -131,7 +131,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseAddOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -149,7 +149,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseAddGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -167,7 +167,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseDivOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -185,7 +185,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseDivGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -203,7 +203,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseMulOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -221,7 +221,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseMulGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -239,7 +239,25 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseSubOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def 
calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseSubGradOpCost(CompOpCost): + OP_TYPE = "elementwise_sub_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseSubGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -257,7 +275,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(EmbeddingOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -275,7 +293,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(EmbeddingGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -293,7 +311,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -311,7 +329,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantBatchSizeLikeOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -329,7 +347,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantBatchSizeLikeGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -347,7 +365,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GatherOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -365,7 +383,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GeluOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in 
the future return 0 @@ -383,7 +401,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GeluGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -401,7 +419,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GreaterEqualOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -419,7 +437,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(IncrementOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -433,7 +451,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(IsEmptyOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -447,7 +465,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LayerNormOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -465,7 +483,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LayerNormGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -483,7 +501,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LessThanOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -501,7 +519,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LogicalNotOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -519,7 +537,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LogicalAndOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, 
the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -537,7 +555,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LodResetOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -554,7 +572,7 @@ class LogOpCost(CompOpCost): def __init__(self, op=None, op_desc=None, cluster=None): super(LogOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -572,7 +590,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LookupTableV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -590,7 +608,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LookupTableV2GradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -608,7 +626,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -626,7 +644,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -644,7 +662,527 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MatmulV2GradOpCost(CompOpCost): + OP_TYPE = "matmul_v2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MatmulV2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete 
COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MemcpyOpCost(CompOpCost): + OP_TYPE = "memcpy" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MemcpyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MulOpCost(CompOpCost): + OP_TYPE = "mul" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MulOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MulGradOpCost(CompOpCost): + OP_TYPE = "mul_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class OneHotOpCost(CompOpCost): + OP_TYPE = "one_hot" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(OneHotOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReadFromArrayOpCost(CompOpCost): + OP_TYPE = "read_from_array" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReadFromArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceSumOpCost(CompOpCost): + OP_TYPE = "reduce_sum" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceSumOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceSumGradOpCost(CompOpCost): + OP_TYPE = "reduce_sum_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceSumGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops 
function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Reshape2OpCost(CompOpCost): + OP_TYPE = "reshape2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Reshape2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Reshape2GradOpCost(CompOpCost): + OP_TYPE = "reshape2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Reshape2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceMeanOpCost(CompOpCost): + OP_TYPE = "reduce_mean" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceMeanOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceMeanGradOpCost(CompOpCost): + OP_TYPE = "reduce_mean_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceMeanGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SamplingIdOpCost(CompOpCost): + OP_TYPE = "sampling_id" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SamplingIdOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ScaleOpCost(CompOpCost): + OP_TYPE = "scale" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ScaleOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SliceOpCost(CompOpCost): + OP_TYPE = "slice" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SliceOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to 
be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxOpCost(CompOpCost): + OP_TYPE = "softmax" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxGradOpCost(CompOpCost): + OP_TYPE = "softmax_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxWithCrossEntropyOpCost(CompOpCost): + OP_TYPE = "softmax_with_cross_entropy" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxWithCrossEntropyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxWithCrossEntropyGradOpCost(CompOpCost): + OP_TYPE = "softmax_with_cross_entropy_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxWithCrossEntropyGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SplitOpCost(CompOpCost): + OP_TYPE = "split" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SplitOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Squeeze2OpCost(CompOpCost): + OP_TYPE = "squeeze2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Squeeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SquareOpCost(CompOpCost): + OP_TYPE = "square" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SquareOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete 
COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SquareGradOpCost(CompOpCost): + OP_TYPE = "square_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SquareGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SumOpCost(CompOpCost): + OP_TYPE = "sum" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SumOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class TopKOpCost(CompOpCost): + OP_TYPE = "top_k" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(TopKOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Transpose2OpCost(CompOpCost): + OP_TYPE = "transpose2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Transpose2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Transpose2GradOpCost(CompOpCost): + OP_TYPE = "transpose2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Transpose2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Unsqueeze2OpCost(CompOpCost): + OP_TYPE = "unsqueeze2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Unsqueeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class WriteToArrayOpCost(CompOpCost): + OP_TYPE = "write_to_array" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(WriteToArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time 
and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index 6fa5b756c75c3..3dbdb79f48541 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -132,15 +132,17 @@ def init(self, dist_attr): key, dist_attr) self._is_annotated = copy.deepcopy(dist_attr._is_annotated) - # def reset(self, skip_dist_attr_field_names): - # if skip_dist_attr_field_names is not None \ - # and "process_mesh" not in skip_dist_attr_field_names: - # self._process_mesh = None - # if skip_dist_attr_field_names is not None \ - # and "dims_mapping" not in skip_dist_attr_field_names: - # for i in enumerate(self._dims_mapping): - # self._dims_mapping[i] = -1 - # self._is_annotated = {} + def reset(self, skip_dist_attr_field_names=None): + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "process_mesh" not in skip_dist_attr_field_names): + self._process_mesh = None + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "dims_mapping" not in skip_dist_attr_field_names): + for i, _ in enumerate(self._dims_mapping): + self._dims_mapping[i] = -1 + self._is_annotated = {} def is_annotated(self, dist_attr_field_name): return self._is_annotated.get(dist_attr_field_name, False) @@ -272,6 +274,9 @@ def set_input_dist_attr(self, name, dist_attr): dist_attr_object.init(dist_attr) self._inputs_dist_attrs[name] = dist_attr_object + # def del_input_dist_attr(self, name): + # del self._inputs_dist_attrs[name] + def get_output_dist_attr(self, name): return self._outputs_dist_attrs.get(name, None) @@ -280,6 +285,9 @@ def set_output_dist_attr(self, name, dist_attr): dist_attr_object.init(dist_attr) self._outputs_dist_attrs[name] = dist_attr_object + # def del_output_dist_attr(self, name): + # del self._inputs_dist_attrs[name] + def get_input_dims_mapping(self, name): input_dist_attr = self.get_input_dist_attr(name) if input_dist_attr: @@ -374,17 +382,18 @@ def init(self, dist_attr): "ProcessMeshes in DistributedOperator must be the same." 
self.process_mesh = shared_process_mesh - # def reset(self, skip_dist_attr_field_names): - # for tensor_dist_attr in self.inputs_dist_attrs.values(): - # tensor_dist_attr.reset(skip_dist_attr_field_names) - # for tensor_dist_attr in self.outputs_dist_attrs.values(): - # tensor_dist_attr.reset(skip_dist_attr_field_names) - # if skip_dist_attr_field_names is not None \ - # and "process_mesh" not in skip_dist_attr_field_names: - # self.process_mesh = None - # self.impl_type = "default" - # self.impl_idx = 0 - # self._is_annotated = {} + def reset(self, skip_dist_attr_field_names=None): + for tensor_dist_attr in self.inputs_dist_attrs.values(): + tensor_dist_attr.reset(skip_dist_attr_field_names) + for tensor_dist_attr in self.outputs_dist_attrs.values(): + tensor_dist_attr.reset(skip_dist_attr_field_names) + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "process_mesh" not in skip_dist_attr_field_names): + self._process_mesh = None + self.impl_type = "default" + self.impl_idx = 0 + self._is_annotated = {} def is_annotated(self, attr_name): return self._is_annotated.get(attr_name, False) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 7299f84504bf3..6a38b53cf2c10 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -57,33 +57,30 @@ def __init__(self, serial_startup_prog=None, serial_optimizer=None, serial_loss=None, - feed_vars=None, - fetch_vars=None, + feed_vars={}, + fetch_vars={}, + cluster=None, strategy=None): # Data members related to original programs (unchanged) self._original_serial_main_program = serial_main_prog self._original_serial_startup_program = serial_startup_prog + self._original_serial_optimizer = serial_optimizer self._original_serial_loss = serial_loss + self._original_serial_feed_vars = feed_vars + self._original_serial_fetch_vars = fetch_vars self._original_serial_optimizer = serial_optimizer - if self._original_serial_main_program is None: - self._original_serial_main_program = paddle.fluid.default_main_program( - ) - if self._original_serial_startup_program is None: - self._original_serial_startup_program = paddle.fluid.default_startup_program( - ) # Data members related to programs (changed) self._serial_main_program = None self._serial_startup_program = None - self._serial_loss = serial_loss - self._serial_optimizer = serial_optimizer - self._serial_feed_vars = feed_vars - self._serial_fetch_vars = fetch_vars + self._serial_loss = None + self._serial_optimizer = None + self._serial_feed_vars = {} + self._serial_fetch_vars = {} # Data members related to the program self._dist_tensors_for_program = {} self._dist_ops_for_program = {} - self._block_state = BlockState() # Data members related to the graph self._serial_graph = None @@ -96,24 +93,30 @@ def __init__(self, # Distributed programs self._dist_main_programs = {} self._dist_startup_programs = {} + self._dist_op_context = DistributedOperatorContext() + self._process_meshes = [] - # Distributed Strategy + self._cluster = cluster self._strategy = strategy # Pass Context self._pass_context = PassContext() - - # Distributed Operator Context - self._dist_op_context = DistributedOperatorContext() + self._block_state = BlockState() # Other data members - self._process_meshes = [] self._serial_ordered_tensor_nodes = [] self._serial_ordered_op_nodes = [] self._serial_ordered_nodes = [] # 
self._tensor_id_to_tensor_node_ids = {} self._is_initialized = False + self._need_copy_dist_attr_to_graph = False + self._backup_pass_context_stack = [] + self._backup_block_state_stack = [] + self._backup_dist_tensors_for_program_stack = [] + self._backup_dist_ops_for_program_stack = [] + self._backup_serial_main_program_stack = [] + self._backup_serial_startup_program_stack = [] # flag whether scale gradient with dp size self._gradient_scale = True @@ -122,13 +125,6 @@ def __init__(self, def serial_main_program(self): return self._serial_main_program - @serial_main_program.setter - def serial_main_program(self, program): - # if self._serial_main_program: - # print("WARNING: The program attached to this distributed context will be replaced by the new one.") - self._original_serial_main_program = program - self._serial_main_program = program - @property def serial_startup_program(self): return self._serial_startup_program @@ -149,6 +145,18 @@ def serial_feed_vars(self): def serial_fetch_vars(self): return self._serial_fetch_vars + @property + def dist_main_programs(self): + return self._dist_main_programs + + @property + def dist_startup_programs(self): + return self._dist_startup_programs + + @property + def cluster(self): + return self._cluster + @property def strategy(self): return self._strategy @@ -177,14 +185,6 @@ def dist_op_context(self): def block_state(self): return self._block_state - @property - def dist_main_programs(self): - return self._dist_main_programs - - @property - def dist_startup_programs(self): - return self._dist_startup_programs - @property def has_annotation(self): return len(self._dist_tensors_for_program) or len( @@ -198,17 +198,168 @@ def gradient_scale(self): def gradient_scale(self, gs): self._gradient_scale = gs - def initialize(self): - if not self._is_initialized: + def _backup_serial_info(self, mode): + self._backup_serial_main_program_stack.append( + self._serial_main_program.clone()) + self._backup_serial_startup_program_stack.append( + self._serial_startup_program.clone()) + self._backup_pass_context_stack.append( + copy.deepcopy(self._pass_context)) + self._backup_block_state_stack.append(copy.deepcopy(self._block_state)) + + def _backup_dist_info(self, mode): + self._backup_dist_tensors_for_program_stack.append( + copy.deepcopy(self._dist_tensors_for_program)) + self._backup_dist_ops_for_program_stack.append( + copy.deepcopy(self._dist_ops_for_program)) + + def _backup(self, serial=True, serial_mode=None, dist=True, dist_mode=None): + # Use this function carefully + if serial: + self._backup_serial_info(serial_mode) + if dist: + self._backup_dist_info(dist_mode) + + def _restore_serial_info(self, mode="to_backup"): + if mode == "to_backup": + self._serial_main_program = self._backup_serial_main_program_stack.pop( + ) + self._serial_startup_program = self._backup_serial_startup_program_stack.pop( + ) + elif mode == "to_original": + assert self._original_serial_main_program is not None + assert self._original_serial_startup_program is not None self._serial_main_program = self._original_serial_main_program.clone( ) self._serial_startup_program = self._original_serial_startup_program.clone( ) - self._serial_main_program = self._original_serial_main_program - self._serial_startup_program = self._original_serial_startup_program - self._serial_loss = self._original_serial_loss - self._serial_optimizer = self._original_serial_optimizer + + self._serial_optimizer = self._original_serial_optimizer + + if self._original_serial_loss: + if 
isinstance(self._original_serial_loss, list): + assert len(self._original_serial_loss) == 1 + loss = self._original_serial_loss[0] + block_idx = loss.block.idx + var_name = loss.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + self._serial_loss = var + else: + block_idx = self._original_serial_loss.block.idx + var_name = self._original_serial_loss.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + self._serial_loss = var + + for key, var_list in self._original_serial_feed_vars.items(): + new_var_list = [] + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) + self._serial_feed_vars[key] = new_var_list + + for key, var_list in self._original_serial_fetch_vars.items(): + new_var_list = [] + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) + self._serial_fetch_vars[key] = new_var_list + + self._pass_context = self._backup_pass_context_stack.pop() + self._block_state = self._backup_block_state_stack.pop() + + def _restore_dist_info(self, mode="to_backup"): + if mode == "to_backup": + self._dist_tensors_for_program = self._backup_dist_tensors_for_program_stack.pop( + ) + self._dist_ops_for_program = self._backup_dist_ops_for_program_stack.pop( + ) + elif mode == "to_original": + assert self._original_dist_tensors_for_program + assert self._original_dist_ops_for_program + self._dist_tensors_for_program = copy.deepcopy( + self._original_dist_tensors_for_program) + self._dist_ops_for_program = copy.deepcopy( + self._original_dist_ops_for_program) + elif mode == "to_default": + new_tensors_ids = [] + for tensor_id, dist_tensor in self._dist_tensors_for_program.items( + ): + if tensor_id in self._tensors_ids: + dist_tensor.dist_attr.reset() + else: + new_tensors_ids.append(tensor_id) + for tensor_id in new_tensors_ids: + self._dist_tensors_for_program.pop(tensor_id) + new_ops_ids = [] + for op_id, dist_op in self._dist_ops_for_program.items(): + if op_id in self._ops_ids: + dist_op.dist_attr.reset() + else: + new_ops_ids.append(op_id) + for op_id in new_ops_ids: + self._dist_ops_for_program.pop(op_id) + else: + new_tensors_ids = [] + for tensor_id, dist_tensor in self._dist_tensors_for_program.items( + ): + new_tensors_ids.append(tensor_id) + for tensor_id in new_tensors_ids: + self._dist_tensors_for_program.pop(tensor_id) + new_ops_ids = [] + for op_id, dist_op in self._dist_ops_for_program.items(): + new_ops_ids.append(op_id) + for op_id in new_ops_ids: + self._dist_ops_for_program.pop(op_id) + self._dist_main_programs = {} + self._dist_startup_programs = {} + self._dist_op_context = DistributedOperatorContext() + self._need_copy_dist_attr_to_graph = True + self._process_meshes = [] + + def _restore(self, + serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_backup"): + # Use this function carefully + if serial: + self._restore_serial_info(serial_mode) + if dist: + self._restore_dist_info(dist_mode) + + def initialize(self): + if not self._is_initialized: + if not self._serial_main_program: + self._serial_main_program = self._original_serial_main_program + if not self._serial_startup_program: + self._serial_startup_program = self._original_serial_startup_program + if not self._serial_loss: + if isinstance(self._original_serial_loss, list): + assert 
len(self._original_serial_loss) == 1 + self._serial_loss = self._original_serial_loss[0] + else: + self._serial_loss = self._original_serial_loss + if not self._serial_optimizer: + self._serial_optimizer = self._original_serial_optimizer + if not self._serial_feed_vars: + self._serial_feed_vars = self._original_serial_feed_vars + if not self._serial_fetch_vars: + self._serial_fetch_vars = self._original_serial_fetch_vars + self._init_dist_attr_for_program() + # Backup the original distributed information for later restore + self._original_dist_tensors_for_program = copy.deepcopy( + self._dist_tensors_for_program) + self._original_dist_ops_for_program = copy.deepcopy( + self._dist_ops_for_program) self._tensors_ids = list(self._dist_tensors_for_program.keys()) self._ops_ids = list(self._dist_ops_for_program.keys()) set_flags({"FLAGS_convert_all_blocks": True}) @@ -216,41 +367,9 @@ def initialize(self): core.Graph(self._serial_main_program.desc)) self._init_dist_attr_for_graph() self._is_initialized = True - - # def reset(self, - # skip_dist_tensors=None, - # skip_dist_ops=None, - # skip_tensor_dist_attr_fields=None, - # skip_op_dist_attr_fields=None): - # self._serial_main_program = self._original_serial_main_program.clone() - # self._serial_startup_program = self._original_serial_startup_program.clone() - # new_tensors_ids = [] - # for tensor_id, dist_tensor in self._dist_tensors_for_program.items(): - # if tensor_id in self._tensors_ids: - # dist_tensor.dist_attr.reset(skip_tensor_dist_attr_fields) - # else: - # new_tensors_ids.append(tensor_id) - # for tensor_id in new_tensors_ids: - # self._dist_tensors_for_program.pop(tensor_id) - # new_ops_ids = [] - # for op_id, dist_op in self._dist_ops_for_program.items(): - # if op_id in self._ops_ids: - # dist_op.dist_attr.reset(skip_op_dist_attr_fields) - # else: - # new_ops_ids.append(op_id) - # for op_id in new_ops_ids: - # self._dist_ops_for_program.pop(op_id) - - # self.copy_dist_attr_from_program_to_graph() - - # self._dist_main_programs = {} - # self._dist_startup_programs = {} - - # self._pass_context = PassContext() - - # self._dist_op_context = DistributedOperatorContext() - - # self._process_meshes = [] + self._need_copy_dist_attr_to_graph = False + if self._need_copy_dist_attr_to_graph: + self.copy_dist_attr_from_program_to_graph() def add_process_mesh(self, process_mesh): assert isinstance(process_mesh, ProcessMesh), \ @@ -419,6 +538,10 @@ def _init_dist_attr_for_program(self, no_default=False): if current_dist_op is None: dist_op = DistributedOperator(op) self.add_dist_op_for_program(dist_op) + self._original_dist_tensors_for_program = copy.deepcopy( + self._dist_tensors_for_program) + self._original_dist_ops_for_program = copy.deepcopy( + self._dist_ops_for_program) def _order_nodes_by_program_order(self): def _contains(nodes, target_node): @@ -588,7 +711,7 @@ def copy_dist_attr_from_graph_to_program(self): op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph - # TODO: the completion algorithm will skip orphan tensors, + # TODO: the completion algorithm will skipped orphan tensors, # here we just set there process_mesh to the first one. 
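The _backup/_restore machinery added to DistributedContext above follows a simple stack discipline: clones of the serial programs and deep copies of the distributed state are pushed before a transformation and popped (or rebuilt from the originals) to roll back. A compact sketch of that pattern, assuming a dict-shaped state and illustrative names rather than the actual context:

    import copy

    class _CtxSketch:
        def __init__(self, dist_tensors):
            self._dist_tensors = dist_tensors
            self._backup_stack = []

        def _backup(self):
            # Push a deep copy so later in-place edits cannot leak into the snapshot.
            self._backup_stack.append(copy.deepcopy(self._dist_tensors))

        def _restore(self, mode="to_backup"):
            if mode == "to_backup":
                self._dist_tensors = self._backup_stack.pop()

    ctx = _CtxSketch({"x": [0, -1]})
    ctx._backup()
    ctx._dist_tensors["x"] = [-1, -1]   # a pass rewrites the sharding in place
    ctx._restore()                      # roll back to the snapshot
    assert ctx._dist_tensors == {"x": [0, -1]}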
for orphan_node in self._serial_orphan_tensor_nodes: serial_tensor_id = orphan_node.var().id() @@ -614,16 +737,21 @@ def amend_dist_attr_for_program(self): tensor_shape = serial_tensor.shape dims_mapping = dist_attr.dims_mapping process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_processes = dist_attr.process_mesh.processes # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len(process_mesh_processes) == 1: + dims_mapping[i] = -1 for dist_op in self._dist_ops_for_program.values(): serial_op = dist_op.serial_op dist_attr = dist_op.dist_attr + process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_processes = dist_attr.process_mesh.processes for arg_name in serial_op.input_arg_names: if dist_op.get_serial_input(arg_name) is None: tensor_shape = [] @@ -635,13 +763,15 @@ def amend_dist_attr_for_program(self): else: tensor_shape = dist_op.get_serial_input(arg_name).shape dims_mapping = dist_attr.get_input_dims_mapping(arg_name) - process_mesh_shape = dist_attr.process_mesh.topology # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len( + process_mesh_processes) == 1: + dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ @@ -650,13 +780,18 @@ def amend_dist_attr_for_program(self): else: tensor_shape = dist_op.get_serial_output(arg_name).shape dims_mapping = dist_attr.get_output_dims_mapping(arg_name) - process_mesh_shape = dist_attr.process_mesh.topology # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) 
for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len( + process_mesh_processes) == 1: + dims_mapping[i] = -1 + if len(process_mesh_processes) == 1: + dist_op.dist_attr.impl_type = "default" + dist_op.dist_attr.impl_idx = 0 def validate_dist_attr_for_program(self): if not self._is_initialized: @@ -670,16 +805,20 @@ def validate_dist_attr_for_program(self): dist_tensor.serial_tensor.name) if (dist_tensor is not None) and ( not dist_tensor.validate_dist_attr()): - assert False, "Tensor {} has a wrong distributed attributes {}.".format( - dist_tensor.serial_tensor.name, dist_tensor.dist_attr) + assert False, "Tensor {} (id: {}, original_id: {}) has a wrong distributed attributes {}.".format( + dist_tensor.serial_tensor.name, + dist_tensor.desc.id(), + dist_tensor.desc.original_id(), dist_tensor.dist_attr) for op in block.ops: dist_op = self.get_dist_op_for_program(op) assert dist_op is not None, \ "Operator {} does not have a distributed attribute.".format( dist_op.serial_op.type) if (dist_op is not None) and (not dist_op.validate_dist_attr()): - assert False, "Operator {} has a wrong distributed attributes {}.".format( - dist_op.serial_op.type, dist_op.dist_attr) + assert False, "Operator {} (id: {}, original_id: {}) has a wrong distributed attributes {} .".format( + dist_op.serial_op.type, + dist_op.serial_op.desc.id(), + dist_op.serial_op.desc.original_id(), dist_op.dist_attr) return True def __deepcopy__(self, memo): diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index a42ce863492b3..e3f06da275182 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -41,7 +41,7 @@ def _validate_sizes_and_dist_attr(sizes, rank=None, shard_sizes=None): if not (isinstance(sizes, (list, tuple)) and - all(map(lambda x: isinstance(x, int) and x > 0, sizes))): + all(map(lambda x: isinstance(x, int) and x >= 0, sizes))): raise ValueError( "The sizes must be list or tuple and item in sizes must be non-negative integer, but got {}". 
format(sizes)) @@ -79,8 +79,11 @@ def get_local_sizes(global_sizes, local_sizes = [] # for even sharding, the local sizes of every rank are equal + for idx, item in enumerate(global_sizes): - if dims_mapping[idx] == -1: + # This is a trick to avoid dims_mapping is [] + val = dims_mapping[idx] if idx < len(dims_mapping) else -1 + if val == -1: local_sizes.append(item) else: local_sizes.append(item // topology[dims_mapping[idx]]) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index c38953ca9e64d..ab9391cf66fdb 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -31,10 +31,11 @@ from paddle.fluid.framework import Operator from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.distributed import fleet from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -from .cluster import Cluster +# from .cluster import Cluster, get_default_cluster from .planner_v2 import Planner from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator @@ -57,7 +58,11 @@ def __init__(self, self.inputs_spec = self._validate_spec(inputs_spec) self.labels_spec = self._validate_spec(labels_spec) self.cluster = cluster + # if self.cluster is None: + # self.cluster = get_default_cluster() self.strategy = strategy + if self.strategy is None: + self.strategy = fleet.DistributedStrategy() self._executor = None self._cur_rank = paddle.distributed.get_rank() @@ -69,11 +74,11 @@ def __init__(self, self._orig_main_prog = fluid.default_main_program() self._orig_startup_prog = fluid.default_startup_program() self._orig_dist_context = get_default_distributed_context() + self._dist_contexts = {} self._serial_main_progs = {} self._serial_startup_progs = {} self._dist_main_progs = defaultdict(dict) # dist main programs self._dist_startup_progs = defaultdict(dict) # dist startup programs - self._dist_contexts = {} self._feed_vars = {} self._fetch_vars = {} @@ -104,11 +109,17 @@ def prepare(self, parallelizer.parallel(self._cur_rank) else: parallelizer.parallel_all() - # Get the distributed main programs and startup programs + # Get the current content from the distributed context + self._serial_main_progs[mode] = self._dist_contexts[ + mode].serial_main_program + self._serial_startup_progs[mode] = self._dist_contexts[ + mode].serial_startup_program self._dist_main_progs[mode] = self._dist_contexts[ mode].dist_main_programs self._dist_startup_progs[mode] = self._dist_contexts[ mode].dist_startup_programs + self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars + self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars # Init comm and startup program self._initialize(mode) @@ -135,20 +146,23 @@ def _build(self, mode): inputs = [self._set_data_parallel(var) for var in inputs] labels = [self._set_data_parallel(var) for var in labels] - self._feed_vars[mode] = {"inputs": inputs, "labels": labels} + # self._feed_vars[mode] = {"inputs": inputs, "labels": labels} + feed_vars = {"inputs": inputs, "labels": labels} - self._fetch_vars[mode] = { + # self._fetch_vars[mode] = { + # "outputs": flatten(outputs), + # "loss": losses, + # "metrics": metrics + # } + fetch_vars = { "outputs": flatten(outputs), "loss": losses, "metrics": metrics } - self._serial_main_progs[mode] = serial_main_prog - self._serial_startup_progs[mode] = 
serial_startup_prog self._dist_contexts[mode] = DistributedContext( - self._serial_main_progs[mode], self._serial_startup_progs[mode], - self._optimizer, losses, self._feed_vars[mode], - self._fetch_vars[mode], self.strategy) + serial_main_prog, serial_startup_prog, self._optimizer, losses, + feed_vars, fetch_vars, self.cluster, self.strategy) self._dist_contexts[mode].gradient_scale = self._gradient_scale def _initialize(self, mode): diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 3ff474697205e..295e3557df27d 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -16,7 +16,7 @@ from .common import DistributedOperatorImpl from .common import register_distributed_operator_impl_container from .common import register_distributed_operator_impl -from .common import find_best_compatible_distributed_operator_impl +from .common import find_compatible_distributed_operator_impls from . import dist_embedding from . import dist_matmul from . import dist_reshape diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 441eb88a9f1ee..6b3c655f293bd 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -157,9 +157,7 @@ def register_distributed_operator_impl(op_type, dist_impl): assert False, "Must register distributed operator registry first." -def find_best_compatible_distributed_operator_impl(dist_op, - fwd=True, - partial=True): +def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True): """ Here just return the first compatible implemention. This will be improved by cost model in the future. 
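find_compatible_distributed_operator_impls (renamed above from find_best_compatible_distributed_operator_impl) still just collects the compatible implementations and, per its docstring, leaves ranking to a future cost model. A toy sketch of that selection step, assuming a hypothetical list-based registry rather than Paddle's real operator containers:

    class _ImplSketch:
        def __init__(self, name, predicate):
            self.name = name
            self._predicate = predicate

        def is_auto_compatible(self, dist_op):
            return self._predicate(dist_op)

    def find_compatible_impls(registry, dist_op):
        # Collect every implementation whose compatibility check passes; callers
        # may take the first one or rank them with a cost model later on.
        return [impl for impl in registry if impl.is_auto_compatible(dist_op)]

    registry = [
        _ImplSketch("column_parallel", lambda op: op.get("sharded_dim") == 1),
        _ImplSketch("replicated", lambda op: True),   # always-applicable fallback
    ]
    names = [impl.name for impl in find_compatible_impls(registry, {"sharded_dim": 0})]
    assert names == ["replicated"]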
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 6d9b48ea1e87c..e18cee6d42dca 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -187,7 +187,7 @@ def is_auto_compatible(self, dist_op): for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if serial_tensor.is_parameter: + if serial_tensor is not None and serial_tensor.is_parameter: for mapping in dims_mapping: if mapping != -1: return False @@ -217,7 +217,7 @@ def is_auto_compatible(self, dist_op): for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if serial_tensor.is_parameter: + if serial_tensor is not None and serial_tensor.is_parameter: for mapping in dims_mapping: if mapping != -1: return False @@ -363,7 +363,7 @@ def forward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.append_op(type='nop').desc + dist_op_desc = main_block.desc.append_op() dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -371,6 +371,8 @@ def forward(ctx, *args, **kwargs): for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) + main_block._sync_with_cpp() + # data parallel synchronization for primtive operators from paddle.incubate.autograd import prim_enabled if prim_enabled(): @@ -426,6 +428,8 @@ def forward(ctx, *args, **kwargs): op_attr.set_input_dims_mapping(param.name, dims_mapping) ctx.set_op_dist_attr_for_program(new_op, op_attr) + startup_block._sync_with_cpp() + @staticmethod def backward(ctx, *args, **kwargs): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index 89cd2c9d9e41a..4d52e5a94beb1 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -22,7 +22,6 @@ from .common import register_distributed_operator_impl from .common import set_comm_op_dist_attr_for_program from .dist_default import DistributedDefaultImpl0 -from ..reshard import Resharder from ..process_group import new_process_group from ..utils import is_dim_shard, is_dim_replicate, _get_corresponding_rank from ..utils import compute_compatible_dim_mapping, set_dist_op_desc_original_id, _get_comm_group @@ -324,6 +323,8 @@ def backward(ctx, *args, **kwargs): process_mesh_shape = op_dist_attr.process_mesh.topology process_mesh_group = op_dist_attr.process_mesh.processes dims_mapping = [0] + [-1 for _ in range(len(new_X_grad.shape) - 1)] + from ..reshard import Resharder + partition_idx = Resharder.compute_partition_index( rank_id, new_X_grad.shape, dims_mapping, process_mesh_shape, process_mesh_group) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py index 755dcab4be34f..3275bddd9b4cc 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py @@ -107,13 +107,14 @@ def forward(ctx, *args, **kwargs): output_name) # 
replicate op in dist program - dist_op_desc = main_block.append_op(type='nop').desc + dist_op_desc = main_block.desc.append_op() dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): dist_op_desc.set_input(input_name, kwargs[input_name]) for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) + main_block._sync_with_cpp() # batch dimension synchronization var_name = src_op.output_arg_names[0] diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index 6a94bbd3130b9..218513323dffb 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -35,7 +35,7 @@ def __init__(self, mode, completer, dist_context): self._mode = mode self._completer = completer self._dist_context = dist_context - self._dist_context.initialize() + assert self._dist_context._is_initialized self._pass_context = self._dist_context.pass_context self._strategy = self._dist_context.strategy @@ -43,7 +43,9 @@ def parallel_all(self): world_process_group = get_world_process_group() all_ranks = world_process_group.ranks for rank in all_ranks: + # self._dist_context._backup(serial=True, dist=True) self.parallel(rank) + # self._dist_context._restore(serial=True, dist=True) def parallel(self, rank): serial_main_program = self._dist_context.serial_main_program @@ -51,13 +53,14 @@ def parallel(self, rank): serial_optimizer = self._dist_context.serial_optimizer if self._mode == "train" and serial_optimizer: # Generate backward - serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + serial_loss = self._dist_context.serial_loss params_grads = self._generate_backward( serial_main_program, serial_startup_program, serial_loss) # Apply pre optimization passes self._apply_pre_optimization(serial_main_program, serial_startup_program, serial_loss, serial_optimizer, params_grads) + # Do logical partition partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( @@ -85,7 +88,6 @@ def parallel(self, rank): resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, [], 1) resharder.reshard() - # Clone program for test if self._mode != 'train': dist_main_prog = dist_main_prog.clone(for_test=True) diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 91a31dd1b922e..6a767e5afcdf6 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -25,7 +25,7 @@ from .dist_attribute import OperatorDistributedAttribute from .process_group import new_process_group from .utils import set_dist_op_desc_original_id -from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op +from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op, is_optimize_op from .operators.common import BACKWARD_ONLY_DIST_OPS __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] @@ -211,7 +211,7 @@ def partition_block(self, ref_block, target_block): forward_op_id2forward_op = {} for idx in range(len(serial_ops)): if idx <= last_fwd_op_idx: - forward_op_id2forward_op[serial_ops[idx].desc.id( + forward_op_id2forward_op[serial_ops[idx].desc.original_id( )] = serial_ops[idx] 
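Throughout this change, gradient ops are mapped back to their forward ops by desc.original_id() instead of desc.id(), because the original id is meant to survive descriptor copies (for example when an op is copied into a partitioned block) while the plain id does not. A toy illustration of why the lookup stays resolvable under that assumption; the descriptor class below is a hypothetical stand-in, not Paddle's OpDesc:

    class _OpDescSketch:
        _next_id = 0

        def __init__(self, original_id=None):
            _OpDescSketch._next_id += 1
            self._id = _OpDescSketch._next_id                      # new on every copy
            self._original_id = original_id if original_id is not None else self._id

        def id(self):
            return self._id

        def original_id(self):
            return self._original_id

        def copy(self):
            # The fresh descriptor gets a new id but keeps the original id.
            return _OpDescSketch(original_id=self._original_id)

    fwd_desc = _OpDescSketch()
    copied_desc = fwd_desc.copy()                        # e.g. appended into another block
    forward_op_by_id = {fwd_desc.original_id(): "matmul_v2"}
    assert forward_op_by_id[copied_desc.original_id()] == "matmul_v2"
    assert copied_desc.id() != fwd_desc.id()             # a plain id() key would miss here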
appended_grad_times = 0 @@ -263,14 +263,14 @@ def partition_block(self, ref_block, target_block): dist_op_backward_impl.backward( self._dist_context, **kinputs, **koutputs, **{"grad_var_to_var": grad_var_to_var}) - elif int(op.attr('op_role')) == 2: + elif is_optimize_op(op): kinputs, koutputs = dist_op_context.prepare_context(op) dist_op_impl = get_distributed_operator_impl_container( "default").get_impl(0) dist_op_impl.backward(self._dist_context, **kinputs, **koutputs) else: raise NotImplementedError( - "partitioner only support forward op and backward op, but got {}". + "partitioner only support forward and backward, optimize ops, but got {}". format(str(op))) def _is_valid_annotated_program(self, program): @@ -408,9 +408,9 @@ def _partition_var(dist_context, src_block, dst_block, src_varname, def _get_dist_op_backward_implement(backward_op, dist_context, forward_op_id2forward_op): dist_op_context = dist_context.dist_op_context - if backward_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - forward_op_id = dist_op_context.grad_op_id_to_op_id[backward_op.desc.id( - )] + if backward_op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + forward_op_id = dist_op_context.grad_op_id_to_op_id[ + backward_op.desc.original_id()] forward_op = forward_op_id2forward_op[forward_op_id] forward_op_dist_attr = dist_context.get_op_dist_attr_for_program( forward_op) diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py index 7db17e98d07ee..3625a25d74e0e 100755 --- a/python/paddle/distributed/auto_parallel/planner_v2.py +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -16,6 +16,8 @@ from .dist_context import get_default_distributed_context from .utils import print_program_with_dist_attr +# from .tuner.parallel_tuner import ParallelTuner + class Planner: def __init__(self, mode, dist_context): @@ -24,19 +26,28 @@ def __init__(self, mode, dist_context): # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need # dependency of backward-forward ops in forward completion. + # TODO: The id mapping will be lost if we clone the original program. 
default_ctx = get_default_distributed_context() self._dist_context._dist_op_context = default_ctx.dist_op_context self._dist_context.initialize() self._completer = Completer(self._dist_context) + self._strategy = dist_context.strategy + # if self._strategy.auto_search: + # self._parallel_tuner = ParallelTuner( + # self._dist_context, mode=self._mode) + @property def completer(self): return self._completer def plan(self): self._completer.complete_forward_annotation() + # if self._strategy.auto_search: + # self._parallel_tuner.tune() + # else: + # self._completer.complete_forward_annotation() # parse forward sub block self._dist_context.block_state.parse_forward_blocks( self._dist_context.serial_main_program) - # TODO: add the auto searcher diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index fbe3a43a7917a..7b198e288c636 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -324,10 +324,13 @@ def _get_corresponding_rank(dist_context, target_mesh, rank): mesh.processes.index(rank)) break - assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( - rank) - return target_mesh.processes[_coordinate2linear_idx(mesh.topology, - coordinate)] + # assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( + # rank) + if coordinate is not None: + return target_mesh.processes[_coordinate2linear_idx(mesh.topology, + coordinate)] + else: + return target_mesh.processes[0] def _get_unshard_dist_shape(var, dist_attr): @@ -1096,6 +1099,11 @@ def is_backward_op(op): int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Backward) +def is_optimize_op(op): + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) + + def is_loss_op(op): return OP_ROLE_KEY in op.attr_names and \ int(op.all_attrs()[OP_ROLE_KEY]) == (int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Loss)) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index d0b5c915e11cd..5e2ad43c16431 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -140,17 +140,12 @@ def broadcast_dp_parameters(model, hcg): def fused_allreduce_gradients(parameter_list, hcg): - if _in_legacy_dygraph(): - data_parallel_group = None if hcg is None else hcg.get_data_parallel_group( - ) - logger.debug("dp start fuse allreduce gradients") - with framework.no_grad(): - _apply_collective_grads(parameter_list, data_parallel_group) - elif in_dygraph_mode(): - assert hcg is None, "It's not support to use hcg in EagerDygraph now." 
- data_parallel_group = paddle.distributed.collective._get_default_group() - with framework.no_grad(): - _apply_collective_grads_eager(parameter_list, data_parallel_group) + data_parallel_group = None if hcg is None else hcg.get_data_parallel_group() + logger.debug("dp start fuse allreduce gradients") + apply_func = _apply_collective_grads_eager if in_dygraph_mode( + ) else _apply_collective_grads + with framework.no_grad(): + apply_func(parameter_list, data_parallel_group) def sharding_reduce_gradients(parameter_list, hcg): diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index fe94c25e12d2d..3cd04affa29c2 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -46,13 +46,13 @@ def _build_stats(self, amp_lists, dist_context): if int(op.attr('op_role')) == int(OpRole.Forward): self._mark_black_white_ops(amp_lists) elif int(op.attr('op_role')) == int(OpRole.Backward): - if op.desc.id() in dist_op_context.grad_op_id_to_op_id: - fwd_op_id = dist_op_context.grad_op_id_to_op_id[op.desc.id( - )] + if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[ + op.desc.original_id()] if self._is_fp16_op(fwd_op_id) == True: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True elif self._is_fp16_op(fwd_op_id) == False: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif int(op.attr('op_role')) == int(OpRole.Optimize): break @@ -70,12 +70,12 @@ def _mark_black_white_ops(self, amp_lists): continue if amp_lists.black_varnames is not None and _is_in_black_varnames( op, amp_lists): - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False continue if op.type in amp_lists.black_list: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif op.type in amp_lists.white_list: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True elif op.type in amp_lists.gray_list: is_black_op = False is_white_op = False @@ -95,22 +95,22 @@ def _mark_black_white_ops(self, amp_lists): else: prev_op = in_var.op # if it's one of inputs - if self._is_fp16_op(prev_op.desc.id()) == False or \ + if self._is_fp16_op(prev_op.desc.original_id()) == False or \ prev_op.type in amp_lists.black_list: is_black_op = True - elif self._is_fp16_op(prev_op.desc.id()) == True or \ + elif self._is_fp16_op(prev_op.desc.original_id()) == True or \ prev_op.type in amp_lists.white_list: is_white_op = True if is_black_op: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif is_white_op: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True else: pass else: # For numerical safe, we apply fp32 computation on ops that # are not determined which list they should stay. 
- self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False def cast_forward_program(self, dist_context): ops = self._block.ops @@ -120,11 +120,11 @@ def cast_forward_program(self, dist_context): num_cast_ops = 0 if int(op.attr('op_role')) == int(OpRole.Backward): break - if self._is_fp16_op(op.desc.id()) == False: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_cast_op_forward( op, idx, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, dist_context) - elif self._is_fp16_op(op.desc.id()) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_cast_op_forward( op, idx, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, dist_context) @@ -198,7 +198,7 @@ def _insert_cast_op_forward(self, op, idx, src_dtype, dst_dtype, else: if op.has_attr('in_dtype'): op._set_attr('in_dtype', dst_dtype) - self._var_name_dict[op.desc.id()] = var_name_dict + self._var_name_dict[op.desc.original_id()] = var_name_dict if src_dtype == core.VarDesc.VarType.FP32 and dst_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: @@ -225,13 +225,14 @@ def cast_backward_program(self, params_grads, dist_context): while idx < len(ops): num_cast_ops = 0 grad_op = ops[idx] + grad_op_orig_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(grad_op.desc.id()) == False: # fp32 + if grad_op_orig_id in dist_op_context.grad_op_id_to_op_id: + if self._is_fp16_op(grad_op_orig_id) == False: # fp32 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, dist_context) - elif self._is_fp16_op(grad_op.desc.id()) == True: # fp16 + elif self._is_fp16_op(grad_op_orig_id) == True: # fp16 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, dist_context) @@ -272,8 +273,9 @@ def _keep_fp32_output(op, out_name): return False num_cast_ops = 0 + original_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context - fwd_op_id = dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()] + fwd_op_id = dist_op_context.grad_op_id_to_op_id[original_id] for in_name in grad_op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 9dda310e5c022..b01f3975aefdd 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -153,23 +153,24 @@ def _mark_op(self, op): # ernie inference trick if op.type == "assign" and "array_" in op.input_arg_names[0]: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False return if _need_keep_fp32(op, self.amp_list.unsupported_list, self.use_fp16_guard): - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False else: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True for var_name in op.output_arg_names: # assert var_name not in self.forward_non_leaf_tensors, "{}".format(var_name) self.forward_non_leaf_tensors[var_name] = op.desc.id() elif is_backward_op(op) == int(OpRole.Backward): - if op.desc.id() in self.grad_op_to_op_map: - fwd_op_id = self.grad_op_to_op_map[op.desc.id()] + if op.desc.original_id() in 
self.grad_op_to_op_map: + fwd_op_id = self.grad_op_to_op_map[op.desc.original_id()] assert fwd_op_id in self._op_fp16_dict, "{}".format(str(op)) - self._op_fp16_dict[op.desc.id()] = self._op_fp16_dict[fwd_op_id] + self._op_fp16_dict[op.desc.original_id()] = self._op_fp16_dict[ + fwd_op_id] if int(op.attr('op_role')) == 257: self.is_train = True @@ -192,10 +193,10 @@ def set_var_to_fp16(self, var_name, block): def resolute_tensor_dtype(self, block): for op in block.ops: - op_id = op.desc.id() if is_forward_op(op): # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - if self._is_fp16_op(op_id) == True or op.type == "cast": + if self._is_fp16_op(op.desc.original_id()) == True \ + or op.type == "cast": for in_name in op.input_names: if _keep_fp32_input(op, in_name): continue @@ -209,7 +210,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op_id) == False: + elif self._is_fp16_op(op.desc.original_id()) == False: for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -217,7 +218,7 @@ def resolute_tensor_dtype(self, block): if out_var.dtype == core.VarDesc.VarType.FP16: out_var.desc.set_dtype(core.VarDesc.VarType.FP32) elif is_backward_op(op): - if self._is_fp16_op(op_id) == True: + if self._is_fp16_op(op.desc.original_id()) == True: for out_name in op.output_names: if _keep_fp32_output(op, out_name): continue @@ -225,7 +226,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op_id) == False: + elif self._is_fp16_op(op.desc.original_id()) == False: for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -238,28 +239,27 @@ def cast_block(self, block): idx = 0 while idx < len(block.ops): op = block.ops[idx] - op_id = op.desc.id() num_cast_ops = 0 if op.type in __amp_skip_ops__: idx += 1 continue elif is_forward_op(op): - if self._is_fp16_op(op_id) == False: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_forward_cast_ops( op, idx, block, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, self.dist_context) - elif self._is_fp16_op(op_id) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_forward_cast_ops( op, idx, block, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, self.dist_context) elif is_backward_op(op): - if op_id in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(op_id) == False: + if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_backward_cast_ops( op, idx, block, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, self.dist_context) - elif self._is_fp16_op(op_id) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_backward_cast_ops( op, idx, block, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, self.dist_context) @@ -282,7 +282,6 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, dist_context): num_cast_ops = 0 - op_id = op.desc.id() for in_name in op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and 
_keep_fp32_input( @@ -300,7 +299,7 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, cast_name = in_var.name + '.cast_' + _dtype_to_str( dst_dtype) cast_var = block.vars.get(cast_name) - self.forward_input_cast_ops[op_id] += [( + self.forward_input_cast_ops[op.desc.original_id()] += [( cast_name, in_var.name, dst_dtype, src_dtype, in_name)] in_var_dist_attr = consume_op_attr.get_input_dist_attr( @@ -349,8 +348,9 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, num_cast_ops = 0 op_id = op.desc.id() + original_id = op.desc.original_id() dist_op_context = dist_context.dist_op_context - forward_op_id = dist_op_context.grad_op_id_to_op_id[op_id] + forward_op_id = dist_op_context.grad_op_id_to_op_id[original_id] grad_op_attr = dist_context.get_op_dist_attr_for_program(op) assert grad_op_attr is not None diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index 258f46304d189..c6d1685446277 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -315,7 +315,7 @@ def _apply_single_impl(self, main_programs, startup_programs, context): # When traversing all grad_ops in reverse, need to set a flag to indicate # whether the ckpt and its segment_descs can be used. ckpt_op = op_path[segment[1] - 1] - ckpt_ops_dict[ckpt_op.desc.id()] = [True, segment_descs] + ckpt_ops_dict[ckpt_op.desc.original_id()] = [True, segment_descs] # step 4: insert recomputed fwd ops ops = main_block.ops @@ -339,9 +339,9 @@ def _apply_single_impl(self, main_programs, startup_programs, context): _rename_arg_([grad_op.desc], key, var_name_dict[key]) # insert recomputed ops - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - fwd_op_id = dist_op_context.grad_op_id_to_op_id[grad_op.desc.id( - )] + original_id = grad_op.desc.original_id() + if original_id in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[original_id] if fwd_op_id in ckpt_ops_dict and ckpt_ops_dict[fwd_op_id][0]: idx = grad_op.idx while idx - 1 >= 0 and ops[idx - 1].type == "sum": diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 145ecc83cfc26..ed3e0bc98ed6d 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1107,8 +1107,10 @@ def update_distop_context(distop_context, op_grad_to_var, distop_context.grad_var_to_var[appending_grad_times].update( op_grad_to_var) for op_desc in grad_op_desc: - assert op_desc.id() not in distop_context.grad_op_id_to_op_id - distop_context.grad_op_id_to_op_id[op_desc.id()] = op.desc.id() + assert op_desc.original_id( + ) not in distop_context.grad_op_id_to_op_id + distop_context.grad_op_id_to_op_id[op_desc.original_id( + )] = op.desc.original_id() if callbacks is not None: assert (isinstance(callbacks, (list, tuple))) @@ -1255,12 +1257,6 @@ def update_distop_context(distop_context, op_grad_to_var, for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - # Rebuild the mapping because new_op_desc has a differnt id (Only for auto parallel) - if distop_context is not None: - if op_desc.id() in distop_context.grad_op_id_to_op_id: - distop_context.grad_op_id_to_op_id[new_op_desc.id( - )] = distop_context.grad_op_id_to_op_id[op_desc.id()] - distop_context.grad_op_id_to_op_id.pop(op_desc.id()) new_op_desc._set_attr(op_role_attr_name, backward) 
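The hunks above consistently re-key the auto-parallel bookkeeping (the fp16 op dict, grad_op_id_to_op_id, the recompute checkpoint dict) by op.desc.original_id() instead of op.desc.id(), and drop the code in backward.py that rebuilt grad_op_id_to_op_id after copy_from because the copied desc gets a different id(). The sketch below is a toy illustration of why a copy-stable key removes the need for that rebuild; ToyOpDesc is a made-up stand-in, not Paddle's real OpDesc API.

    import itertools

    _ids = itertools.count()

    class ToyOpDesc:
        # Toy stand-in: id() is unique per object, original_id() survives copy_from().
        def __init__(self):
            self._id = next(_ids)
            self._original_id = self._id

        def copy_from(self, other):
            self._original_id = other.original_id()

        def id(self):
            return self._id

        def original_id(self):
            return self._original_id

    fwd, grad = ToyOpDesc(), ToyOpDesc()
    grad_op_id_to_op_id = {grad.original_id(): fwd.original_id()}

    appended = ToyOpDesc()
    appended.copy_from(grad)
    # Keyed by id() this lookup would miss and the map would need rebuilding;
    # keyed by original_id() it still resolves after the copy.
    assert grad_op_id_to_op_id[appended.original_id()] == fwd.original_id()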
grad_to_var["__current_op_desc__"] = new_op_desc if callbacks is not None: diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index e543bc1e17b2c..348d914943521 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -376,13 +376,6 @@ def _update_activations(self, graph): activation = "" if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): activation = "relu" - elif op.op().has_attr("fuse_brelu") and op.op().attr( - "fuse_brelu"): - activation = "relu6" - alpha = 6.0 - if op.op().has_attr("fuse_brelu_threshold"): - alpha = op.op().attr("fuse_brelu_threshold") - op.set_attr("fuse_alpha", alpha) op.set_attr("fuse_activation", activation) return graph diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index f0dae081dd48f..04e1decd4af68 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -177,8 +177,7 @@ def prepare_program_conv2d(self, program): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format, - 'fuse_brelu': True + 'data_format': self.data_format }) def remove_fuse_activation_attribute(self, graph): @@ -196,9 +195,6 @@ def check_graph_after_pass(self, graph): self.assertTrue(op.op().has_attr("fuse_activation")) if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): self.assertTrue(op.op().attr("fuse_activation") == "relu") - if op.op().has_attr("fuse_brelu") and op.op().attr( - "fuse_brelu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu6") def test_quant_update_activation(self): program = fluid.Program() diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 2efb6965085de..eac2941c09778 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -61,7 +61,7 @@ class FunctionCache(object): def __init__(self): # Caches the converted static functions. {dygraph_func: static_func} - self._converted_static_func_caches = dict() + self._converted_static_func_caches = weakref.WeakKeyDictionary() # Caches the converted ast node for same source code. 
{source_code: ast_root} self._code_to_ast_caches = dict() self._dygraph_to_static = DygraphToStaticAst() diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a3310f1a46ce4..72114a275156d 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1137,7 +1137,11 @@ def __init__(self, self.bias = None def forward(self, input): - if _non_static_mode(): + if in_dygraph_mode(): + out, _, _, = _C_ops.final_state_instance_norm( + input, self.scale, self.bias, self._epsilon) + return out + if _in_legacy_dygraph(): out, _, _ = _C_ops.instance_norm(input, self.scale, self.bias, 'epsilon', self._epsilon) return out @@ -3016,9 +3020,15 @@ def __init__(self, is_bias=True) def forward(self, input): - if in_dygraph_mode(): + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + + if _non_static_mode(): attrs = ('epsilon', self._epsilon, 'groups', self._groups) - out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, *attrs) + out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, + mean_out, variance_out, *attrs) return dygraph_utils._append_activation_in_dygraph(out, self._act) else: @@ -3029,10 +3039,6 @@ def forward(self, input): inputs['Scale'] = self.weight # create output - mean_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) group_norm_out = self._helper.create_variable_for_type_inference( dtype=self._dtype) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index add3d73efc7e1..d6b50249df0bc 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -902,7 +902,7 @@ def values(self): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1, 2, 3, 4, 5] dense_shape = [3, 4] - sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) print(sparse_x.values()) #[1, 2, 3, 4, 5] """ @@ -932,7 +932,7 @@ def to_dense(self): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1, 2, 3, 4, 5] dense_shape = [3, 4] - sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) dense_x = sparse_x.to_dense() #[[0., 1., 0., 2.], # [0., 0., 3., 0.], diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 757b1a2da95b9..bd453b3ddaa00 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3600,6 +3600,10 @@ def append_op(self, *args, **kwargs): attrs = kwargs.get("attrs", {}) inplace_map = kwargs.get("inplace_map", None) type = kwargs.get("type", None) + warnings.warn( + "Op `%s` is executed through `append_op` under the dynamic mode, " + "the corresponding API implementation needs to be upgraded to " + "using `_C_ops` method." 
% type, DeprecationWarning) op = Operator( block=self, desc=None, diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py index 55297ed516ffb..2756eac990ed3 100644 --- a/python/paddle/fluid/ir.py +++ b/python/paddle/fluid/ir.py @@ -101,6 +101,9 @@ def apply_pass(name): if build_strategy.enable_auto_fusion and use_cuda: apply_pass("fusion_group_pass") build_strategy.enable_auto_fusion = False + if build_strategy.fuse_gemm_epilogue: + apply_pass("fuse_gemm_epilogue_pass") + build_strategy.fuse_gemm_epilogue = False if build_strategy.fuse_elewise_add_act_ops: apply_pass("fuse_elewise_add_act_pass") build_strategy.fuse_elewise_add_act_ops = False diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index b78865a0ece4e..99c0a2e70b771 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -336,28 +336,7 @@ def square_error_cost(input, label): # [0.01, 0.01] """ - if _non_static_mode(): - minus_out = _C_ops.elementwise_sub(input, label) - square_out = _C_ops.square(minus_out) - return square_out - - check_variable_and_dtype(input, "input", ['float32', 'float64'], - 'square_error_cost') - check_variable_and_dtype(label, "label", ['float32', 'float64'], - 'square_error_cost') - helper = LayerHelper('square_error_cost', **locals()) - minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='elementwise_sub', - inputs={'X': [input], - 'Y': [label]}, - outputs={'Out': [minus_out]}) - - square_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='square', inputs={'X': [minus_out]}, - outputs={'Out': [square_out]}) - return square_out + return paddle.nn.functional.square_error_cost(input, label) def edit_distance(input, @@ -433,45 +412,8 @@ def edit_distance(input, # [4] """ - check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') - check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') - helper = LayerHelper("edit_distance", **locals()) - - # remove some tokens from input and labels - if ignored_tokens is not None and len(ignored_tokens) > 0: - erased_input = helper.create_variable_for_type_inference(dtype="int64") - erased_label = helper.create_variable_for_type_inference(dtype="int64") - - helper.append_op( - type="sequence_erase", - inputs={"X": [input]}, - outputs={"Out": [erased_input]}, - attrs={"tokens": ignored_tokens}) - input = erased_input - - helper.append_op( - type="sequence_erase", - inputs={"X": [label]}, - outputs={"Out": [erased_label]}, - attrs={"tokens": ignored_tokens}) - label = erased_label - - this_inputs = {"Hyps": [input], "Refs": [label]} - if input_length is not None and label_length is not None: - this_inputs['HypsLength'] = [input_length] - this_inputs['RefsLength'] = [label_length] - - # edit distance op - edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") - sequence_num = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op( - type="edit_distance", - inputs=this_inputs, - outputs={"Out": [edit_distance_out], - "SequenceNum": [sequence_num]}, - attrs={"normalized": normalized}) - - return edit_distance_out, sequence_num + return paddle.nn.functional.loss.edit_distance( + input, label, normalized, ignored_tokens, input_length, label_length) def warpctc(input, @@ -1279,52 +1221,9 @@ def softmax_with_cross_entropy(logits, out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) print(out) """ - if _non_static_mode(): - 
if core.is_compiled_with_npu(): - softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - else: - if in_dygraph_mode(): - softmax, loss = _C_ops.final_state_cross_entropy_with_softmax( - logits, label, soft_label, True, numeric_stable_mode, - ignore_index, axis) - if _in_legacy_dygraph(): - softmax, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - if not return_softmax: - return loss - else: - return loss, softmax - - attrs = { - 'soft_label': soft_label, - 'ignore_index': ignore_index, - 'numeric_stable_mode': numeric_stable_mode, - 'axis': axis - } - helper = LayerHelper('softmax_with_cross_entropy', **locals()) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - - outputs = {'Softmax': softmax, 'Loss': loss} - if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): - backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) - outputs['Backprop'] = backprop - helper.append_op( - type='softmax_with_cross_entropy', - inputs={'Logits': logits, - 'Label': label}, - outputs=outputs, - attrs=attrs) - - if return_softmax: - return loss, softmax - - return loss + return paddle.nn.functional.loss.fluid_softmax_with_cross_entropy( + logits, label, soft_label, ignore_index, numeric_stable_mode, + return_softmax, axis) def rank_loss(label, left, right, name=None): @@ -1733,33 +1632,7 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): print(npair_loss) """ - check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], - 'npair_loss') - check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], - 'positive') - check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], - 'labels') - Beta = 0.25 - batch_size = labels.shape[0] - - labels = nn.reshape(labels, shape=[batch_size, 1]) - labels = paddle.tile(labels, repeat_times=[1, batch_size]) - - labels = equal(labels, nn.transpose(labels, perm=[1, 0])).astype('float32') - labels = labels / nn.reduce_sum(labels, dim=1, keep_dim=True) - - l2loss = nn.reduce_mean(nn.reduce_sum(square(anchor), 1)) \ - + nn.reduce_mean(nn.reduce_sum(square(positive), 1)) - l2loss = l2loss * Beta * l2_reg - - similarity_matrix = paddle.matmul( - anchor, positive, transpose_x=False, transpose_y=True) - softmax_ce = softmax_with_cross_entropy( - logits=similarity_matrix, label=labels, soft_label=True) - cross_entropy = nn.reduce_sum(labels * softmax_ce, 0) - celoss = nn.reduce_mean(cross_entropy) - - return l2loss + celoss + return paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg) def mse_loss(input, label): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 97506ead5fad4..7fb9f6057b55a 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7394,30 +7394,8 @@ def dice_loss(input, label, epsilon=0.00001, name=None): predictions = F.softmax(x) loss = F.dice_loss(input=predictions, label=label) """ - assert input.dtype in (paddle.float32, paddle.float64) - assert label.dtype in (paddle.int32, paddle.int64) - assert len(input.shape) >= 2, \ - "The rank of input should be greater than or equal to 2." 
- assert len(input.shape) == len(label.shape), ( - "The rank of input and label should be equal, " - "but received input: %d, label: %d." % - (len(input.shape), len(label.shape))) - assert label.shape[-1] == 1, ("The last dimension of label should be 1, " - "but received %d." % label.shape[-1]) - assert input.shape[:-1] == label.shape[:-1], ( - "All dimensions should be equal except the last one.") - assert input.numel() > 0 and label.numel() > 0, \ - "Any dimension of input and label cannot be equal to 0." - - label = squeeze(label, [-1]) - label = paddle.nn.functional.one_hot(label, input.shape[-1]) - reduce_dim = list(range(1, len(input.shape))) - inse = reduce_sum(input * label, dim=reduce_dim) - dice_denominator = reduce_sum( - input, dim=reduce_dim) + reduce_sum( - label, dim=reduce_dim) - dice_score = 1 - inse * 2 / (dice_denominator + epsilon) - return reduce_mean(dice_score) + return paddle.nn.functional.dice_loss( + input, label, epsilon=epsilon, name=name) def image_resize(input, @@ -7793,10 +7771,18 @@ def _is_list_or_turple_(data): } if out_shape is not None: - if isinstance(out_shape, Variable): + if isinstance(out_shape, Variable) and not _non_static_mode(): out_shape.stop_gradient = True inputs['OutSize'] = out_shape else: + if _non_static_mode(): + if isinstance(out_shape, Variable): + out_shape = list(out_shape.numpy()) + else: + out_shape = list(out_shape) + for i, dim in enumerate(out_shape): + if isinstance(dim, Variable): + out_shape[i] = dim.numpy()[0] if not (_is_list_or_turple_(out_shape)): raise TypeError( "out_shape should be a list or tuple or Variable.") @@ -7863,7 +7849,9 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[2] else: - if isinstance(scale, Variable): + if _non_static_mode() and isinstance(scale, Variable): + scale = scale.numpy() + elif isinstance(scale, Variable): scale.stop_gradient = True inputs["Scale"] = scale elif isinstance(scale, float) or isinstance(scale, int): @@ -7883,6 +7871,26 @@ def _is_list_or_turple_(data): inputs["OutSize"] = actual_shape elif actual_shape is not None: raise TypeError("actual_shape should either be Variable or None.") + + if _non_static_mode(): + attr_list = [] + for k, v in attrs.items(): + attr_list.append(k) + attr_list.append(v) + dy_attr = tuple(attr_list) + + if resample_type == "linear": + out = _C_ops.linear_interp(input, actual_shape, *dy_attr) + elif resample_type == "bilinear": + out = _C_ops.bilinear_interp(input, actual_shape, *dy_attr) + elif resample_type == "trilinear": + out = _C_ops.trilinear_interp(input, actual_shape, *dy_attr) + elif resample_type == "nearest": + out = _C_ops.nearest_interp(input, actual_shape, *dy_attr) + elif resample_type == "bicubic": + out = _C_ops.bicubic_interp(input, actual_shape, *dy_attr) + return out + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='{}_interp'.format(resample_type), @@ -13573,22 +13581,7 @@ def log_loss(input, label, epsilon=1e-4, name=None): prob = paddle.randn((10,1)) cost = F.log_loss(input=prob, label=label) """ - if in_dygraph_mode(): - return _C_ops.final_state_log_loss(input, label, epsilon) - - helper = LayerHelper('log_loss', **locals()) - check_variable_and_dtype(input, 'input', ['float32'], 'log_loss') - check_variable_and_dtype(label, 'label', ['float32'], 'log_loss') - - loss = helper.create_variable_for_type_inference(dtype=input.dtype) - - helper.append_op( - type='log_loss', - inputs={'Predicted': [input], - 'Labels': [label]}, - outputs={'Loss': [loss]}, - attrs={'epsilon': epsilon}) - 
return loss + return paddle.nn.functional.log_loss(input, label, epsilon, name) def add_position_encoding(input, alpha, beta, name=None): @@ -13892,33 +13885,8 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): input = paddle.randn([6, 4, 2, 2]) out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ - if data_format not in ["NCHW", "NHWC"]: - raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. " - "Received Attr(data_format): {}.".format(data_format)) - if _non_static_mode(): - return _C_ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', - shift_ratio, 'data_format', data_format) - - helper = LayerHelper("temporal_shift", **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') - check_type(seg_num, 'seg_num', int, 'temporal_shift') - check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift') - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - if not isinstance(seg_num, int): - raise TypeError("seg_num must be int type.") - - helper.append_op( - type="temporal_shift", - inputs={"X": x}, - outputs={"Out": out}, - attrs={ - "seg_num": seg_num, - "shift_ratio": shift_ratio, - "data_format": data_format - }) - return out + return paddle.nn.functional.temporal_shift(x, seg_num, shift_ratio, name, + data_format) class PyFuncRegistry(object): @@ -15046,63 +15014,8 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): y = F.unfold(x, [3, 3], 1, 1, 1) """ - helper = LayerHelper("unfold", **locals()) - - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') - - assert len(x.shape) == 4, \ - "input should be the format of [N, C, H, W]" - - if isinstance(kernel_sizes, int): - kernel_sizes = [kernel_sizes, kernel_sizes] - else: - assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ - "kernel_sizes should either be an integer or a list of two integers" - - if isinstance(strides, int): - strides = [strides, strides] - else: - assert isinstance(strides, list) and (len(strides) == 2), \ - "strides should either be an integer or a list of two integers" - - if isinstance(dilations, int): - dilations = [dilations, dilations] - else: - assert isinstance(dilations, list) and (len(dilations) == 2), \ - "dilations should either be an integer or a list of two integers" - - if isinstance(paddings, int): - paddings = [paddings] * 4 - elif isinstance(paddings, list): - if len(paddings) == 2: - paddings = paddings * 2 - elif len(paddings) == 4: - pass - else: - raise ValueError( - "paddings should either be an integer or a list of 2 or 4 integers" - ) - else: - raise ValueError( - "Unexpected type of paddings, it should be either an integer or a list" - "of 2 or 4 integers") - - if in_dygraph_mode(): - return _C_ops.final_state_unfold(x, kernel_sizes, strides, paddings, - dilations) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="unfold", - inputs={"X": x}, - outputs={"Y": out}, - attrs={ - "kernel_sizes": kernel_sizes, - "strides": strides, - "paddings": paddings, - "dilations": dilations - }) - return out + return paddle.nn.functional.unfold(x, kernel_sizes, strides, paddings, + dilations, name) def deformable_roi_pooling(input, @@ -15554,26 +15467,7 @@ def gather_tree(ids, parents): # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]] """ - if in_dygraph_mode(): - return _C_ops.final_state_gather_tree(ids, parents) - else: - if _in_legacy_dygraph(): - return _C_ops.gather_tree(ids, parents) - 
else: - helper = LayerHelper('gather_tree', **locals()) - check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], - 'gather_tree') - check_variable_and_dtype(parents, 'parents', ['int32', 'int64'], - 'gather_tree') - out = helper.create_variable_for_type_inference(dtype=ids.dtype) - - helper.append_op( - type="gather_tree", - inputs={"Ids": ids, - "Parents": parents}, - outputs={"Out": out}) - - return out + return paddle.nn.functional.gather_tree(ids, parents) @deprecated(since="2.0.0", update_to="paddle.uniform") diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 80dc990af4556..702e38f3d2368 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle from .layer_function_generator import templatedoc from ..framework import core, Variable, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph, convert_np_dtype_to_dtype_ from ..layer_helper import LayerHelper @@ -1382,35 +1383,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): """ - if in_dygraph_mode(): - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) - if maxlen is not None: - if isinstance(maxlen, core.eager.Tensor): - attrs = ('out_dtype', dtype) - out = _C_ops.sequence_mask(x, maxlen, *attrs) - else: - attrs = ('out_dtype', dtype, 'maxlen', maxlen) - out = _C_ops.sequence_mask(x, None, *attrs) - out.stop_gradient = True - return out - - helper = LayerHelper('sequence_mask', **locals()) - out = helper.create_variable_for_type_inference(dtype=dtype) - - inputs = {'X': [x]} - attrs = {'out_dtype': out.dtype} - if maxlen is not None: - if isinstance(maxlen, Variable): - inputs['MaxLenTensor'] = maxlen - else: - attrs['maxlen'] = maxlen - - helper.append_op( - type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs) - - out.stop_gradient = True - return out + return paddle.nn.functional.sequence_mask(x, maxlen, dtype, name) @templatedoc() diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index b02c154584e9c..3b1fcc15ab95f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -681,14 +681,19 @@ def assign(input, output=None): "saving it to file and 'load_op' to load it") if output is None: output = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + if _non_static_mode(): + _C_ops.assign_value(output, 'shape', + list(input.shape), 'dtype', dtype, value_name, + values) + else: + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) if is_inplace and _non_static_mode(): output._bump_inplace_version() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a78c820e1e66a..34237d47a5659 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -131,6 +131,8 @@ if(NOT WITH_GPU) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) LIST(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_op) LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer) + LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op) + 
LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op_api) endif() LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) @@ -327,6 +329,7 @@ if ((NOT WITH_NCCL) AND (NOT WITH_RCCL)) endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) + LIST(REMOVE_ITEM TEST_OPS test_fused_gate_attention_op) LIST(REMOVE_ITEM TEST_OPS test_boxps) endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 346939fb5ce28..381461130ed5c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -31,4 +31,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) + py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py index 4cdd51e42adf0..af7a44b5aaa23 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py @@ -54,6 +54,35 @@ from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulGradOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulV2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulV2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MemcpyOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MulOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MulGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import OneHotOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReadFromArrayOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceSumOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceSumGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Reshape2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Reshape2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceMeanOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceMeanGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SamplingIdOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ScaleOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SliceOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxWithCrossEntropyOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxWithCrossEntropyGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SplitOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Squeeze2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SquareOpCost +from 
paddle.distributed.auto_parallel.cost.comp_op_cost import SquareGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SumOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import TopKOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Unsqueeze2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import WriteToArrayOpCost from test_cluster import cluster_json @@ -244,6 +273,155 @@ def test_comp_cost(self): self.assertTrue(op_cost.time >= 0) self.assertTrue(op_cost.memory >= 0) + op_cost = MatmulV2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MemcpyOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MulOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MulGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = OneHotOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReadFromArrayOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceSumOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceSumGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Reshape2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MatmulV2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Reshape2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceMeanOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceMeanGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SamplingIdOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ScaleOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SliceOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + 
self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxWithCrossEntropyOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxWithCrossEntropyGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SplitOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Squeeze2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SquareOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SquareGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SumOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = TopKOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Transpose2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Transpose2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Unsqueeze2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = WriteToArrayOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) # Remove unnecessary files if os.path.exists(cluster_json_path): os.remove(cluster_json_path) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py new file mode 100644 index 0000000000000..f7718e584f5e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -0,0 +1,204 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
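The block of test_comp_cost.py assertions above applies the same three checks to each newly imported *OpCost class. A possible loop-based equivalent is sketched below; it assumes only what the diff itself shows, namely that every cost class accepts a cluster keyword argument and exposes flops, time, and memory, and it would live inside the same test_comp_cost method (so self and cluster come from the surrounding test).

    new_op_cost_classes = [
        MatmulV2GradOpCost, MemcpyOpCost, MulOpCost, MulGradOpCost,
        OneHotOpCost, ReadFromArrayOpCost, ReduceSumOpCost,
        ReduceSumGradOpCost, Reshape2OpCost, Reshape2GradOpCost,
    ]
    for cost_cls in new_op_cost_classes:
        op_cost = cost_cls(cluster=cluster)
        # Same invariants as the hand-written assertions above.
        self.assertTrue(op_cost.flops >= 0)
        self.assertTrue(op_cost.time >= 0)
        self.assertTrue(op_cost.memory >= 0)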
+ +import unittest +import os +import json + +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F + +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [[0, 1], [2, 3]] + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, 0] + }) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _g_process_mesh[1], + "dims_mapping": [0, -1] + }) + out = self.linear1(out) + + return out + + +def get_program(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + with static.program_guard(train_program, start_program): + # input + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] + # dataloader + dataloader = paddle.io.DataLoader.from_generator( + feed_list=data_holder, capacity=4 * batch_size, iterable=False) + dataloader.set_batch_generator( + batch_generator_creator(), places=paddle.static.cuda_places()) + # data dist_attr + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [0, -1, -1] + }) + auto.shard_tensor( + label, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [0, -1, -1] + }) + + mlp_start = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_start(input) + + mlp_mid = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_mid(pred) + + mlp_end = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * 
hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_end(pred) + + error_cost = paddle.nn.functional.square_error_cost(pred, label) + loss = paddle.mean(error_cost) + + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = {"inputs": [input], "labels": [label]} + fetch_vars = {"loss": [loss]} + + return train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars + + +class TestDistributedContext(unittest.TestCase): + def test_backup_restore(self): + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program( + ) + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars) + dist_context.initialize() + + dist_context._backup(serial=True, dist=True) + dist_context._restore( + serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_backup") + + dist_context._backup(serial=True, dist=True) + dist_context._restore( + serial=True, + serial_mode="to_original", + dist=True, + dist_mode="to_original") + + dist_context._backup(serial=True, dist=True) + dist_context._restore(serial=True, dist=True, dist_mode="to_default") + + dist_context._backup(serial=True, dist=True) + dist_context._restore(serial=True, dist=True, dist_mode="to_nothing") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index aa0bf719fab29..8af055a09a343 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -94,7 +94,8 @@ def test_dist_slice_serial(self): ops = dist_main_prog.global_block().ops for op in ops: op_dist_attr = dist_context.get_op_dist_attr_for_program(op) - assert op_dist_attr.impl_type == "slice" + # We amend this impl_type after completion + assert op_dist_attr.impl_type == "default" for out in op.output_arg_names: var_dims_mapping = op_dist_attr.get_output_dims_mapping(out) ref_dims_mapping = [-1 for i in range(len(var_dims_mapping))] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py index 1179fd9a9f088..9989f5bbdc605 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -27,7 +27,7 @@ from paddle.distributed.auto_parallel.utils import make_data_unshard from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context -from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.operators import find_compatible_distributed_operator_impls from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index 894bed7108a1d..d296d9433302d 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py +++ 
b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -28,7 +28,7 @@ from paddle.distributed.auto_parallel.utils import make_data_unshard from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context -from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.operators import find_compatible_distributed_operator_impls from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/collective_global_gather.py b/python/paddle/fluid/tests/unittests/collective_global_gather.py index d3a6071ed04df..164abe0593491 100644 --- a/python/paddle/fluid/tests/unittests/collective_global_gather.py +++ b/python/paddle/fluid/tests/unittests/collective_global_gather.py @@ -23,6 +23,7 @@ import paddle.fluid.layers as layers from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main import pickle +from paddle.fluid.framework import _enable_legacy_dygraph paddle.enable_static() @@ -74,6 +75,9 @@ def run_trainer(self, args): world_size = 2 tot_expert = n_expert * world_size paddle.disable_static() + + # Call paddle.distributed.alltoall() under legacy dygraph + _enable_legacy_dygraph() np.random.seed(os.getpid()) local_expert_count = np.random.randint( 1, 4, size=tot_expert).astype("int64") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 487a69807e2b0..39f55fb45b87b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -61,7 +61,6 @@ def setUp(self): self.fuse_activation = "" self.fuse_alpha = 0 self.fuse_beta = 0 - self.fuse_brelu_threshold = 6.0 self.fuse_residual_connection = False self.input_residual_size = None @@ -99,7 +98,6 @@ def setUp(self): self.attrs['fuse_activation'] = self.fuse_activation self.attrs['fuse_alpha'] = self.fuse_alpha self.attrs['fuse_beta'] = self.fuse_beta - self.attrs['fuse_brelu_threshold'] = self.fuse_brelu_threshold self.attrs['fuse_residual_connection'] = self.fuse_residual_connection self.outputs['Output'] = output diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py index 46ee2a14a2018..7b0bb706aece9 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py @@ -14,7 +14,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, skip_check_grad_ci import paddle.fluid as fluid import paddle @@ -92,6 +92,17 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].sum()} +@OpTestTool.skip_if_not_cpu() +class TestReduceSum4DNoReduceSimpleCopyOneDNNOp( + TestReduceDefaultWithGradOneDNNOp): + def setUp(self): + self.op_type = "reduce_sum" + self.use_mkldnn = True + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'dim': tuple(), 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': np.copy(self.inputs['X'])} + + @skip_check_grad_ci( 
reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") diff --git a/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py new file mode 100644 index 0000000000000..0c33bd6b1ade8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +np.random.seed(10) + + +class TestAbs(OpTest): + def setUp(self): + self.op_type = "abs" + self.set_mlu() + self.dtype = 'float32' + self.shape = [4, 25] + + np.random.seed(1024) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], check_eager=False) + + +class TestAbsHalf(OpTest): + def setUp(self): + self.op_type = "abs" + self.set_mlu() + self.dtype = 'float16' + self.shape = [7, 9, 13, 19] + + np.random.seed(1024) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], check_eager=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py new file mode 100644 index 0000000000000..dea6391b8bae0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid.core as core +import paddle.nn.functional as F + +np.random.seed(10) +paddle.enable_static() + + +def ref_log_softmax(x): + shiftx = (x - np.max(x)) + out = shiftx - np.log(np.exp(shiftx).sum()) + return out + + +def ref_log_softmax_grad(x, axis): + if axis < 0: + axis += len(x.shape) + out = np.apply_along_axis(ref_log_softmax, axis, x) + axis_dim = x.shape[axis] + dout = np.full_like(x, fill_value=1. / x.size) + dx = dout - np.exp(out) * dout.copy().sum(axis=axis, keepdims=True).repeat( + axis_dim, axis=axis) + return dx + + +class TestLogSoftmaxOp(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.set_mlu() + self.python_api = F.log_softmax + self.dtype = 'float32' + self.shape = [2, 3, 4, 5] + self.axis = -1 + self.set_attrs() + + x = np.random.uniform(0.1, 1., self.shape).astype(self.dtype) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis} + + def set_attrs(self): + pass + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + + +class TestLogSoftmaxShape(TestLogSoftmaxOp): + def set_attrs(self): + self.shape = [12, 10] + + +class TestLogSoftmaxAxis(TestLogSoftmaxOp): + def set_attrs(self): + self.axis = 1 + + +class TestNNLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.set_mlu() + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def check_api(self, axis=-1): + ref_out = np.apply_along_axis(ref_log_softmax, axis, self.x) + + logsoftmax = paddle.nn.LogSoftmax(axis) + # test static api + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = logsoftmax(x) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + # test dygrapg api + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = logsoftmax(x) + self.assertTrue(np.allclose(y.numpy(), ref_out)) + paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + + +class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.set_mlu() + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def check_api(self, axis=-1, 
dtype=None): + x = self.x.copy() + if dtype is not None: + x = x.astype(dtype) + ref_out = np.apply_along_axis(ref_log_softmax, axis, x) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = F.log_softmax(x, axis, dtype) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = F.log_softmax(x, axis, dtype) + self.assertTrue(np.allclose(y.numpy(), ref_out), True) + paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + self.check_api(-1, 'float32') + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='X1', shape=[100], dtype='int32') + self.assertRaises(TypeError, F.log_softmax, x) + + x = paddle.fluid.data(name='X2', shape=[100], dtype='float32') + self.assertRaises(TypeError, F.log_softmax, x, dtype='int32') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py new file mode 100644 index 0000000000000..a56e9ff7558f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py @@ -0,0 +1,235 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
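The ref_log_softmax helper in the new test_log_softmax_op_mlu.py above subtracts the maximum before exponentiating; the shift is the usual numerical-stability device and cancels out, so the reference agrees with the naive log-softmax. A quick NumPy check of that equivalence, standalone rather than part of the test file:

    import numpy as np

    def ref_log_softmax(x):
        # same reference formulation as in the test above
        shiftx = x - np.max(x)
        return shiftx - np.log(np.exp(shiftx).sum())

    x = np.random.uniform(0.1, 1.0, size=8)
    naive = x - np.log(np.exp(x).sum())
    assert np.allclose(ref_log_softmax(x), naive)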
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +import sys +sys.path.append('..') +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.framework import Program, program_guard, _test_eager_guard + +paddle.enable_static() + + +class TestOneHotOp(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_attr(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} + self.outputs = {'Out': (out, x_lod)} + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_default_dtype(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_default_dtype_attr(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_exception(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + self.depth = 10 + self.place = 
core.CPUPlace() + self.dimension = 12 + self.x = core.LoDTensor() + x_lod = [[4, 1, 3, 3]] + data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))] + data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) + self.x.set(data, self.place) + self.x.set_recursive_sequence_lengths(x_lod) + + def test_check_output(self): + program = Program() + with program_guard(program): + x = fluid.layers.data( + name='x', shape=[self.dimension], dtype='float32', lod_level=1) + block = program.current_block() + one_hot_out = block.create_var( + name="one_hot_out", + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='float32') + block.append_op( + type='one_hot', + inputs={'X': x}, + attrs={'depth': self.depth}, + outputs={'Out': one_hot_out}) + exe = fluid.Executor(self.place) + + def run(): + exe.run(feed={'x': self.x}, + fetch_list=[one_hot_out], + return_numpy=False) + + self.assertRaises(ValueError, run) + + +class TestOneHotOpApi(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_api(self): + depth = 10 + self._run(depth) + + def test_api_with_depthTensor(self): + depth = fluid.layers.assign(input=np.array([10], dtype=np.int32)) + self._run(depth) + + def test_api_with_dygraph(self): + depth = 10 + label = np.array([np.random.randint(0, depth - 1) + for i in range(6)]).reshape([6, 1]) + with fluid.dygraph.guard(): + one_hot_label = fluid.one_hot( + input=fluid.dygraph.to_variable(label), depth=depth) + + one_hot_label = paddle.nn.functional.one_hot( + fluid.dygraph.to_variable(label), depth) + # with _test_eager_guard(): + # one_hot_label = paddle.nn.functional.one_hot( + # paddle.to_tensor(label), depth) + + def _run(self, depth): + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + one_hot_label = fluid.one_hot(input=label, depth=depth) + + label_data = np.array([np.random.randint(0, 10 - 1) + for i in range(6)]).reshape([6, 1]) + + exe = fluid.Executor(self.place) + exe.run(fluid.default_startup_program()) + ret = exe.run(feed={'label': label_data, }, + fetch_list=[one_hot_label], + return_numpy=False) + + +class BadInputTestOnehotV2(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_error(self): + with fluid.program_guard(fluid.Program()): + + def test_bad_x(): + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="float32") + one_hot_label = fluid.one_hot(input=label, depth=4) + + self.assertRaises(TypeError, test_bad_x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py index 85ade1179b7d6..c6135383721e1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py @@ -328,5 +328,32 @@ def run(place): run(place) +class TestArgMaxAPI_3(unittest.TestCase): + def initTestCase(self): + self.dims = (1, 9) + self.dtype = 'float32' + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = [paddle.NPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmax(numpy_input).reshape([1]) + paddle_output = paddle.argmax(tensor_input) + self.assertEqual( + 
np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index f7a3dfa1102b2..fe1dbf3b92743 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -263,6 +263,18 @@ def wrapper(cls): return wrapper +def skip_check_inplace_ci(reason=None): + if not isinstance(reason, str): + raise AssertionError( + "The reason for skipping check_inplace is required.") + + def wrapper(cls): + cls.no_need_check_inplace = True + return cls + + return wrapper + + def copy_bits_from_float_to_uint16(f): return struct.unpack('> 16 @@ -1288,6 +1300,9 @@ def check_inplace_output_with_place(self, Returns: None """ + if getattr(self, "no_need_check_inplace", False): + return + has_infer_inplace = fluid.core.has_infer_inplace(self.op_type) has_grad_op_maker = fluid.core.has_grad_op_maker(self.op_type) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 3e2f112e964bb..225bd35a8ec9d 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -271,6 +271,115 @@ def test_adamw_op_dygraph(self): adam.clear_gradients() +class TestAdamWOpMultiPrecison(unittest.TestCase): + def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False): + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + input = paddle.randn((5, 5)) + + model = paddle.nn.Linear(5, 5) + + optimizer = paddle.optimizer.AdamW( + parameters=[{ + 'params': model.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99 + }], + multi_precision=use_amp) + + for idx in range(2): + if place == 'gpu' and use_amp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + def _get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._test_adamw_op_dygraph_place_amp(place, use_amp) + + +class TestAdamWOpError(unittest.TestCase): + def test_api_errors(self): + def test_weight_decay_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=1) + + def test_parameters_dtype1(): + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=paddle.randn((5, 5)), + weight_decay=0.1) + + def test_parameters_dtype2(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters={'params': linear.parameters()}, + weight_decay=0.1) + + def test_parameters_dtype3(): + adam = paddle.optimizer.AdamW( + learning_rate=0.01, parameters=None, weight_decay=0.1) + + def test_parameters_dtype4(): + linear = paddle.nn.Linear(13, 5) + adam 
= paddle.optimizer.AdamW( + learning_rate=0.01, + parameters={'params': set(linear.parameters())}, + weight_decay=0.1) + + def test_learning_rate_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=1, + parameters=linear.parameters(), + weight_decay=0.1) + + def test_grad_clip_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.1, + grad_clip=0.1) + + self.assertRaises(TypeError, test_weight_decay_dtype) + self.assertRaises(TypeError, test_parameters_dtype1) + self.assertRaises(TypeError, test_parameters_dtype2) + self.assertRaises(AttributeError, test_parameters_dtype3) + self.assertRaises(TypeError, test_parameters_dtype4) + self.assertRaises(TypeError, test_learning_rate_dtype) + self.assertRaises(TypeError, test_grad_clip_dtype) + + class TestAdamWOpGroupWithLR(TestAdamWOp): def test_adamw_op_dygraph(self): paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index dbd982947265f..a4e71db3d3850 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -219,9 +219,9 @@ def check_with_place(self, required_envs["GLOO_LOG_LEVEL"] = "TRACE" if eager_mode: - required_envs["FLAGS_enable_eager_mode"] = "%d" % 0 - else: required_envs["FLAGS_enable_eager_mode"] = "%d" % 1 + else: + required_envs["FLAGS_enable_eager_mode"] = "%d" % 0 tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, required_envs) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index 707991352fa5e..dd6dcf6d5e9ae 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -17,6 +17,8 @@ import paddle.fluid.dygraph as dg import paddle.nn.functional as F import paddle.fluid.initializer as I +import paddle +from paddle.fluid.framework import _test_eager_guard import unittest @@ -134,7 +136,8 @@ def functional(self, place): return y_np def paddle_nn_layer(self): - x_var = dg.to_variable(self.input) + x_var = paddle.to_tensor(self.input) + x_var.stop_gradient = False conv = nn.Conv3D( self.num_channels, self.num_filters, @@ -148,17 +151,23 @@ def paddle_nn_layer(self): if not self.no_bias: conv.bias.set_value(self.bias) y_var = conv(x_var) + y_var.backward() y_np = y_var.numpy() - return y_np + t1 = x_var.gradient() + return y_np, t1 def _test_equivalence(self, place): place = fluid.CPUPlace() result1 = self.fluid_layer(place) result2 = self.functional(place) with dg.guard(place): - result3 = self.paddle_nn_layer() + result3, g1 = self.paddle_nn_layer() + with _test_eager_guard(): + res_eager, g2 = self.paddle_nn_layer() np.testing.assert_array_almost_equal(result1, result2) np.testing.assert_array_almost_equal(result2, result3) + self.assertTrue(np.allclose(result3, res_eager)) + self.assertTrue(np.allclose(g1, g2)) def runTest(self): place = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index 784d89b93f985..5bff8b3142106 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -503,6 +503,75 @@ def test_grad(self): self.func(p) +class 
TestDepthWiseConvDoubleGradCheckCase1(unittest.TestCase): + def depthwise_conv2d_wrapper(self, x): + return paddle.nn.functional.conv2d(x[0], x[1], groups=4) + + @prog_scope() + def func(self, place): + x_shape = [2, 4, 3, 3] + w_shape = [4, 1, 3, 3] + eps = 0.005 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 + x = layers.data('x', x_shape, False, dtype) + w = layers.data('w', w_shape, False, dtype) + + # condition of depthwise conv: + # use_cudnn == False + # groups == filters + # num_filters % num_channels == 0 + + y = paddle.nn.functional.conv2d(x, w, groups=4) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.depthwise_conv2d_wrapper, [x, w], + y, + x_init=[x_arr, w_arr], + place=place) + + def test_grad(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestConv3DDoubleGradCheck_NN(unittest.TestCase): + def conv3d_wrapper(self, x): + return paddle.nn.functional.conv3d(x[0], x[1]) + + @prog_scope() + def func(self, place): + x_shape = [2, 3, 8, 8, 8] + w_shape = [6, 3, 3, 3, 3] + eps = 0.005 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 + x = layers.data('x', x_shape, False, dtype) + w = layers.data('w', w_shape, False, dtype) + x.persistable = True + w.persistable = True + y = paddle.nn.functional.conv3d(x, w) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv3d_wrapper, [x, w], y, x_init=[x_arr, w_arr], place=place) + + def test_grad(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index c58d46edde753..b33a943c9f27e 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -478,5 +478,23 @@ def test_shape(self): self.assertEqual(C.shape, (-1, 384)) +class TestBF16(unittest.TestCase): + """ + EinsumOp support bfloat16 type, add unittest here for the correctness. + """ + + def test_shape(self): + cuda_major = paddle.version.cuda().split('.')[0].strip() + if paddle.is_compiled_with_cuda() and int(cuda_major) >= 11: + """ MatmulKernel support bfloat16 only if cuda_major > 11.0. + """ + A = paddle.to_tensor(np.array([1.0, 2.0])).astype(paddle.bfloat16) + A = A.cuda() + B = paddle.to_tensor(np.array([2.0, 3.0])).astype(paddle.bfloat16) + B = B.cuda() + C = paddle.einsum('i,i->', A, B) + self.assertEqual(C.item(), 8.0) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 230bc15e0f1ab..0c8e115d7cebf 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,439 +15,312 @@ from __future__ import print_function import unittest -import math +import itertools import numpy as np import math from op_test import OpTest -import paddle.fluid.core as core # numpy.round has different behavior in comparision to c++ round function # so we use round_c instead of numpy.round to align the output data -def round_c_single_element(x): - dtype = type(x) - if x >= 0: - return dtype(np.floor(x + 0.5)) - else: - return dtype(np.ceil(x - 0.5)) +def round_c_single_element(val): + dtype = type(val) + if val >= 0: + return dtype(np.floor(val + 0.5)) + return dtype(np.ceil(val - 0.5)) round_c = np.vectorize(round_c_single_element) -class TestFakeQuantizeOp(OpTest): - def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_abs_max" - self.attrs = {'bit_length': 8} - self.inputs = {'X': np.random.random((124, 240)).astype(self.dtype), } - scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) - self.outputs = { - 'Out': round_c(self.inputs['X'] / scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype(self.dtype), - } - - def set_dtype(self): - self.dtype = np.float32 +def get_compute_type(dtype): + assert dtype in [np.float16, np.float32, np.float64] + if dtype == np.float16: + return np.float32 + return dtype - def test_check_output(self): - self.check_output() - -class TestFakeQuantizeOpFloat16(TestFakeQuantizeOp): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeQuantizeOp1(OpTest): +class TestFakeQuantizeAbsMaxOp(OpTest): def setUp(self): - self.op_type = "fake_quantize_abs_max" + self.op_type = 'fake_quantize_abs_max' self.attrs = {'bit_length': 8} - self.inputs = {'X': np.zeros((10, 10)).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") - inv_scale = 1.0 / (scale + 1e-6) if scale < 1e-30 else 1.0 / scale - self.outputs = { - 'Out': np.round(self.inputs['X'] * inv_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype("float32"), - } - - def test_check_output(self): - self.check_output() - -class TestFakeQuantizeOp2(OpTest): - def setUp(self): - self.op_type = "fake_quantize_abs_max" - self.attrs = {'bit_length': 8} - self.inputs = {'X': np.full((10, 10), 1e-40).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") + def _fake_quantize_abs_max(self, dtype, input_shape, distribution): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + scale = np.max(np.abs(input_data)) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 inv_scale = 1.0 / (scale + 1e-6) if scale < 1e-30 else 1.0 / scale - self.outputs = { - 'Out': np.round(self.inputs['X'] * inv_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype("float32"), - } - - def test_check_output(self): + output_data = round_c(input_data.astype(compute_type) * inv_scale * bnt) + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype self.check_output() + def test_fake_quantize_abs_max(self): + self._fake_quantize_abs_max(np.float32, (124, 240), np.random.random) -class TestFakeChannelWiseQuantizeOp(OpTest): - def setUp(self): - self.set_dtype() - self.set_arg() - assert self.quant_axis in [0, 1], "quant_axis should 
be 0 or 1." + def test_fake_quantize_abs_max_float16(self): + self._fake_quantize_abs_max(np.float16, (124, 240), np.random.random) - self.op_type = "fake_channel_wise_quantize_abs_max" - self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis} + def test_fake_quantize_abs_max_underflow(self): + self._fake_quantize_abs_max(np.float32, (10, 10), np.zeros) - scales = [] - outputs = self.inputs['X'].copy() - bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 - if self.quant_axis == 0: - for i in range(self.inputs['X'].shape[0]): - scale_v = np.max(np.abs(self.inputs['X'][i])).astype(self.dtype) - scales.append(scale_v) - outputs[i] = round_c( - self.dtype(bnt) * (self.dtype(1.0) / scale_v) * outputs[i]) - elif self.quant_axis == 1: - for i in range(self.inputs['X'].shape[1]): - scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( - self.dtype) - scales.append(scale_v) - outputs[:, i] = round_c( - self.dtype(bnt) * (self.dtype(1.0) / scale_v) * - outputs[:, i]) - - self.outputs = { - 'Out': outputs, - 'OutScale': np.array(scales).astype(self.dtype), - } + def test_fake_quantize_abs_max_underflow2(self): + self._fake_quantize_abs_max(np.float32, (10, 10), + lambda shape: np.full(shape, 1e-40)) - def set_arg(self): - self.quant_axis = 0 - self.inputs = { - 'X': np.random.random((20, 15, 6, 6)).astype(self.dtype), - } - def set_dtype(self): - self.dtype = np.float32 +class TestFakeChannelWiseQuantizeAbsMaxOp(OpTest): + def setUp(self): + self.op_type = 'fake_channel_wise_quantize_abs_max' + self.attrs = {'bit_length': 8} - def test_check_output(self): + def _fake_channel_wise_quantize_abs_max(self, dtype, input_shape, + quant_axis, distribution): + assert quant_axis in [0, 1], 'quant_axis should be 0 or 1.' + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + compute_axis = tuple( + i for i in range(len(input_shape)) if i != quant_axis) + scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True) + output_data = round_c(bnt * input_data.astype(compute_type) / + scale_broadcast) + if quant_axis == 1: + scale_broadcast = np.transpose(scale_broadcast, + (1, ) + compute_axis) + scale = scale_broadcast.reshape(input_shape[quant_axis], -1)[:, 0] + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype + self.attrs['quant_axis'] = quant_axis self.check_output() - -class TestFakeChannelWiseQuantizeOpFloat16(TestFakeChannelWiseQuantizeOp): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeChannelWiseQuantizeOp1(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 1 - self.inputs = { - 'X': np.random.random((15, 20, 5, 5)).astype(self.dtype), - } - - -class TestFakeChannelWiseQuantizeOp1Float16(TestFakeChannelWiseQuantizeOp1): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeChannelWiseQuantizeOp2(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 0 - self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } - - -class TestFakeChannelWiseQuantizeOp3(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 1 - self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } + def test_fake_channel_wise_quantize_abs_max(self): + dtype_options = [np.float32, np.float16] + input_shape_quant_axis_options = [[(20, 15, 6, 6), 0], + [(15, 20, 5, 5), 1], [(30, 15), 0], + [(30, 15), 1]] + for dtype, input_shape_quant_axis 
in itertools.product( + dtype_options, input_shape_quant_axis_options): + input_shape, quant_axis = input_shape_quant_axis + with self.subTest( + dtype=dtype, input_shape=input_shape, + quant_axis=quant_axis): + self._fake_channel_wise_quantize_abs_max( + dtype, input_shape, quant_axis, np.random.random) class TestFakeQuantizeRangeAbsMaxOp(OpTest): def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_range_abs_max" - self.attrs = { - 'bit_length': int(5), - 'window_size': int(1), - 'is_test': False - } - x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype(self.dtype) + self.op_type = 'fake_quantize_range_abs_max' + self.attrs = {'bit_length': 5, 'window_size': 1} + + def _fake_quantize_range_abs_max(self, + dtype, + input_shape, + distribution, + is_test=False): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + in_scale = np.zeros(1).astype(dtype) + out_scale = np.zeros(self.attrs['window_size']).astype(dtype) + out_scale[0] = np.max(np.abs(input_data)) + if is_test: + out_scale[0] = in_scale[0] = out_scale[0] - 1.0 + clip_data = np.clip(input_data, -in_scale, in_scale) + else: + clip_data = input_data + output_data = round_c( + clip_data.astype(compute_type) / out_scale[0] * bnt) self.inputs = { - 'X': x, - 'Iter': np.zeros(1).astype("int64"), - 'InScale': np.zeros(1).astype(self.dtype) + 'X': input_data, + 'Iter': np.zeros(1).astype(np.int64), + 'InScale': in_scale } - scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) - - out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) - out_scales[0] = scale self.outputs = { - 'Out': round_c( - self.dtype((1 << (self.attrs['bit_length'] - 1)) - 1) * - (self.dtype(1.0) / scale) * self.inputs['X']), - 'OutScale': scale, - 'OutScales': out_scales, + 'Out': output_data, + 'OutScale': out_scale[0], + 'OutScales': out_scale } - - def set_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): + self.dtype = dtype + self.attrs['is_test'] = is_test self.check_output() - -class TestFakeQuantizeRangeAbsMaxOpFloat16(TestFakeQuantizeRangeAbsMaxOp): - def set_dtype(self): - self.dtype = np.float16 + def test_fake_quantize_range_abs_max(self): + dtype_options = [np.float32, np.float16] + is_test_options = [False, True] + for dtype, is_test in itertools.product(dtype_options, is_test_options): + self.attrs['bit_length'] = 8 if is_test else 5 + with self.subTest(dtype=dtype, is_test=is_test): + self._fake_quantize_range_abs_max( + dtype, (8, 16, 7, 7), + lambda shape: (np.random.random(shape) - 0.5) * 10, + is_test=is_test) class TestMovingAverageAbsMaxScaleOp(OpTest): def setUp(self): - self.op_type = "moving_average_abs_max_scale" + self.op_type = 'moving_average_abs_max_scale' self.attrs = {'moving_rate': float(0.9), 'is_test': False} - accum = np.zeros(1).astype("float32") - accum[0] = 1 - state = np.zeros(1).astype("float32") - state[0] = 1 - x = np.random.random((8, 16, 7, 7)).astype("float32") - self.inputs = { - 'X': x, - 'InAccum': accum, - 'InState': state, - } - out = x - out_accum = np.zeros(1).astype("float32") - out_state = np.zeros(1).astype("float32") - out_scale = np.zeros(1).astype("float32") - out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max( - np.abs(self.inputs['X'])).astype("float32") - out_state[0] = self.attrs['moving_rate'] * state[0] + 1 + def _moving_average_abs_max_scale(self, dtype, input_shape, distribution): + input_data = 
distribution(input_shape).astype(dtype) + in_accum = np.ones(1).astype(dtype) + in_state = np.ones(1).astype(dtype) + out_accum = self.attrs['moving_rate'] * in_accum[0] + np.max( + np.abs(input_data)) + out_state = self.attrs['moving_rate'] * in_state[0] + 1.0 out_scale = out_accum / out_state + self.inputs = { + 'X': input_data, + 'InAccum': in_accum, + 'InState': in_state + } self.outputs = { - 'Out': out, + 'Out': input_data, 'OutAccum': out_accum, 'OutState': out_state, - 'OutScale': out_scale, + 'OutScale': out_scale } - - def test_check_output(self): + self.dtype = dtype self.check_output() + def test_moving_average_abs_max(self): + self._moving_average_abs_max_scale(np.float32, (8, 16, 7, 7), + np.random.random) -class TestFakeQuantizeRangeAbsMaxOp2(OpTest): - def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_range_abs_max" - self.attrs = { - 'bit_length': int(8), - 'window_size': int(1), - 'is_test': True - } - x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype(self.dtype) - scale = np.array([np.max(np.abs(x)).astype(self.dtype) - 1.0]) - out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) - out_scales[0] = scale.astype(self.dtype) - self.inputs = { - 'X': x, - 'Iter': np.zeros(1).astype("int64"), - 'InScale': scale.astype(self.dtype) - } - xs = np.clip(x, -scale, scale).astype(self.dtype) - qs = round_c( - self.dtype( - self.dtype((1 << (self.attrs['bit_length'] - 1)) - 1) * ( - self.dtype(1.0) / scale) * xs)) - self.outputs = { - 'Out': qs, - 'OutScale': scale.astype(self.dtype), - 'OutScales': out_scales, - } - - def set_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - self.check_output(no_check_set=set(['OutScale', 'OutScales'])) - - -class TestFakeQuantizeRangeAbsMaxOp2Float16(TestFakeQuantizeRangeAbsMaxOp2): - def set_dtype(self): - self.dtype = np.float16 - -class TestMovingOpBase(OpTest): +class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest): def setUp(self): - self.set_dtype() - self.init_type() - self.attrs = { - 'bit_length': int(5), - 'moving_rate': float(0.9), - 'is_test': False - } - accum = np.zeros(1).astype(self.dtype) - accum[0] = 1 - state = np.zeros(1).astype(self.dtype) - state[0] = self.dtype(1.0) - scale = np.zeros(1).astype(self.dtype) - scale[0] = 0.001 + self.op_type = 'fake_quantize_moving_average_abs_max' + self.attrs = {'bit_length': 5, 'moving_rate': 0.9, 'is_test': False} + + def _fake_quantize_moving_average_abs_max(self, + dtype, + input_shape, + distribution, + dequantize=False, + with_gradient=False): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + in_accum = np.ones(1).astype(dtype) + in_state = np.ones(1).astype(dtype) + in_scale = np.array([0.001]).astype(dtype) + out_accum = np.zeros(1).astype(dtype) + out_state = np.zeros(1).astype(dtype) + out_scale = np.zeros(1).astype(dtype) + out_accum[0] = self.attrs['moving_rate'] * in_accum[0] + np.max( + np.abs(input_data)) + out_state[0] = self.attrs['moving_rate'] * in_state[0] + 1.0 + out_scale = out_accum / out_state + round_data = round_c(input_data.astype(compute_type) / out_scale * bnt) + if dequantize: + output_data = (round_data * out_scale / bnt).astype(dtype) + self.op_type = 'fake_quantize_dequantize_moving_average_abs_max' + else: + output_data = round_data.astype(dtype) self.inputs = { - 'X': np.random.random((8, 16, 7, 7)).astype(self.dtype), - 'InScale': scale, - 'InAccum': accum, - 'InState': state, + 'X': 
input_data, + 'InScale': in_scale, + 'InAccum': in_accum, + 'InState': in_state } - - out_accum = np.zeros(1).astype(self.dtype) - out_state = np.zeros(1).astype(self.dtype) - out_scale = np.zeros(1).astype(self.dtype) - out_accum[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(accum[ - 0]) + np.max(np.abs(self.inputs['X'])).astype(self.dtype) - out_state[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(state[ - 0]) + self.dtype(1.0) - out_scale = self.dtype(self.dtype(out_accum) / self.dtype(out_state)) - out_data = self.calc_output(out_scale) self.outputs = { - 'Out': out_data, + 'Out': output_data, 'OutAccum': out_accum, 'OutState': out_state, - 'OutScale': out_scale, + 'OutScale': out_scale } - - def set_dtype(self): - self.dtype = np.float32 - - def init_type(self): - self.op_type = "fake_quantize_moving_average_abs_max" - - def calc_output(self, out_scale): - return round_c(self.inputs['X'] / out_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)) - - def test_check_output(self): + self.dtype = dtype self.check_output() + if with_gradient: + gradient = [ + np.ones(input_data.shape) / np.product(input_data.shape) + ] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) + def test_fake_quantize_moving_average_abs_max(self): + self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7), + np.random.random) -class TestMovingOpBaseFloat16(TestMovingOpBase): - def set_dtype(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output(atol=1e-2) + def test_fake_quantize_moving_average_abs_max_float16(self): + self._fake_quantize_moving_average_abs_max(np.float16, (8, 16, 7, 7), + np.random.random) + def test_fake_quantize_dequantize_moving_average_abs_max(self): + self._fake_quantize_moving_average_abs_max( + np.float32, (8, 16, 7, 7), + np.random.random, + dequantize=True, + with_gradient=True) -class TestFakeQuantDequantMovingOp(TestMovingOpBase): - def init_type(self): - self.op_type = "fake_quantize_dequantize_moving_average_abs_max" - def calc_output(self, out_scale): - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - return np.round(self.inputs['X'] / out_scale * - range_v) * out_scale / range_v - - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) - - -class TestFakeQuantDequantAbsOp(OpTest): +class TestFakeQuantizeDequantizeAbsMaxOp(OpTest): def setUp(self): - self.op_type = "fake_quantize_dequantize_abs_max" + self.op_type = 'fake_quantize_dequantize_abs_max' self.attrs = {'bit_length': 8} - self.inputs = {'X': np.random.random((124, 240)).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") - out_data = self.calc_output(scale) + + def _fake_quantize_dequantize_abs_max(self, dtype, input_shape, + distribution): + input_data = distribution(input_shape).astype(dtype) + scale = np.max(np.abs(input_data)).astype(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + output_data = round_c(input_data / scale * bnt) * scale / bnt + self.inputs = {'X': input_data} self.outputs = { - 'Out': out_data, - 'OutScale': np.array(scale).astype("float32"), + 'Out': output_data, + 'OutScale': np.array(scale).astype(dtype) } - - def calc_output(self, scale): - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - return np.round(self.inputs['X'] / scale * range_v) * scale / range_v - - def test_check_output(self): + self.dtype = dtype self.check_output() + gradient = 
[np.ones(input_data.shape) / np.product(input_data.shape)] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) + def test_fake_quantize_dequantize_abs_max(self): + self._fake_quantize_dequantize_abs_max(np.float32, (124, 240), + np.random.random) -class TestChannelWiseFakeQuantDequantOp(OpTest): +class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest): def setUp(self): - self.set_arg() - assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1." - - self.op_type = "fake_channel_wise_quantize_dequantize_abs_max" - self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis} - - scales = [] - outputs = self.inputs['X'].copy() - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - if self.quant_axis == 0: - for i in range(self.inputs['X'].shape[0]): - scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32") - scales.append(scale_v) - outputs[i] = np.round(outputs[i] * range_v / - scale_v) * scale_v / range_v - elif self.quant_axis == 1: - for i in range(self.inputs['X'].shape[1]): - scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( - "float32") - scales.append(scale_v) - outputs[:, i] = np.round(outputs[:, i] * range_v / - scale_v) * scale_v / range_v - - self.outputs = { - 'Out': outputs, - 'OutScale': np.array(scales).astype("float32"), - } - - def set_arg(self): - self.quant_axis = 0 - self.inputs = { - 'X': np.random.random((3, 4, 64, 64)).astype("float32"), - } + self.op_type = 'fake_channel_wise_quantize_dequantize_abs_max' + self.attrs = {'bit_length': 8} - def test_check_output(self): + def _fake_channel_wise_quantize_dequantize_abs_max( + self, dtype, input_shape, quant_axis, distribution): + assert quant_axis in [0, 1], 'quant_axis should be 0 or 1.' 
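+        # Reference semantics: the per-channel scale below is the max over every
+        # axis except quant_axis; inputs come from np.random.random and are
+        # non-negative, so plain np.amax is equivalent to max(abs(x)) here.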
+ input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + output_data = input_data.copy().astype(compute_type) + compute_axis = tuple( + i for i in range(len(input_shape)) if i != quant_axis) + scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True) + output_data = round_c(bnt * output_data / + scale_broadcast) * scale_broadcast / bnt + if quant_axis == 1: + scale_broadcast = np.transpose(scale_broadcast, + (1, ) + compute_axis) + scale = scale_broadcast.reshape(input_shape[quant_axis], -1)[:, 0] + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype + self.attrs['quant_axis'] = quant_axis self.check_output() + gradient = [np.ones(input_data.shape) / np.product(input_data.shape)] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) - - -class TestChannelWiseFakeQuantDequantOp1(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 1 - self.inputs = { - 'X': np.random.random((15, 20, 5, 5)).astype("float32"), - } - - -class TestChannelWiseFakeQuantDequantOp2(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 0 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } - - -class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 1 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } + def test_channel_wise_fake_quant_dequant_abs_max(self): + input_shape_quant_axis_options = [[(3, 4, 64, 64), 0], [( + 15, 20, 5, 5), 1], [(30, 15), 0], [(30, 15), 1]] + for input_shape, quant_axis in input_shape_quant_axis_options: + with self.subTest(input_shape=input_shape, quant_axis=quant_axis): + self._fake_channel_wise_quantize_dequantize_abs_max( + np.float32, input_shape, quant_axis, np.random.random) def quantize_max_abs(x, max_range): @@ -589,5 +462,5 @@ def test_check_output(self): self.check_output() -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index 67160f59952ef..445620f9e1cb1 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -36,6 +36,18 @@ class TestFusedAttentionOp(OpTest): def setUp(self): self.config() self.generate_input_data() + + self.rtol = 1e-5 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + paddle.set_default_dtype(self.x_type) self.__class__.op_type = "fused_attention" # use autograd to check grad in this unittest. 
@@ -274,9 +286,9 @@ def test_fused_attention_op(self): final_out_ref, x_grad_ref = self.GetBaselineOut() final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4) + final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol) np.testing.assert_allclose( - x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4) + x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol) class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp): @@ -307,9 +319,9 @@ def test_fused_attention_op(self): final_out_ref, x_grad_ref = self.GetBaselineOut() final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-1) + final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol) np.testing.assert_allclose( - x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-1) + x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol) class TestFusedAttentionOpCacheKV(TestFusedAttentionOp): @@ -325,7 +337,10 @@ def test_fused_attention_op(self): final_out_ref = self.GetBaselineOut() final_out, cache_kv_out = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4) + final_out_ref, + final_out.numpy(), + rtol=self.rtol, + atol=self.atol) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py index bdaf32ee0726d..74dc9351a25b4 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py @@ -173,6 +173,17 @@ def setUp(self): self.config() self.generate_input_data() + self.rtol = 1e-5 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + def setAttnMask(self): self.has_attn_mask = True @@ -256,7 +267,8 @@ def run_imperative(self): fused_attn.ln_scale.numpy(), fused_attn_ln_bias, fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias, fused_attn.linear_weight.numpy(), fused_attn_linear_bias) - np.testing.assert_allclose(ref_out, out.numpy(), rtol=1e-5, atol=1e-4) + np.testing.assert_allclose( + ref_out, out.numpy(), rtol=self.rtol, atol=self.atol) def run_static(self): fused_attn = FusedMultiHeadAttention( @@ -341,7 +353,7 @@ def test_static_api(self): self.attn_mask, ln_scale, ln_bias, ln_2_scale, ln_2_bias, qkv_weight, qkv_bias, linear_weight, linear_bias) - np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=1e-4) + np.testing.assert_allclose(ref_out, out, rtol=self.rtol, atol=self.atol) def test_dynamic_api(self): paddle.disable_static(place=paddle.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py new file mode 100644 index 0000000000000..d47450837a455 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +from paddle.nn.layer.transformer import _convert_attention_mask +from paddle import tensor +from paddle.fluid import layers +import unittest +from op_test import OpTest +from paddle.fluid.framework import default_main_program + +default_main_program().random_seed = 42 + + +class TestFusedBiasDropoutResidualLayerNormOp(OpTest): + def setUp(self): + self.config() + self.generate_input_data() + paddle.set_default_dtype(self.x_type) + self.__class__.op_type = "fused_bias_dropout_residual_layer_norm" + # use autograd to check grad in this unittest. + self.__class__.no_need_check_grad = True + paddle.set_default_dtype(np.float32) + self.norm1 = LayerNorm(self.embed_dim) + paddle.set_default_dtype(self.x_type) + self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train") + + def config(self): + self.x_type = np.float32 + self.atol = 1e-4 + self.training = True + self.batch_size = 8 + self.query_length = 128 + self.embed_dim = 1024 + self.dropout_prob = 0.0 + self.weight_attr = None + self.bias_attr = None + + def generate_input_data(self): + self.x = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.residual = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.linear_bias = np.random.rand(self.embed_dim).astype(self.x_type) + self.dout = np.random.random((self.batch_size, self.query_length, + self.embed_dim)).astype(self.x_type) + + if self.bias_attr is False: + self.tensor_linear_bias = None + else: + self.tensor_linear_bias = paddle.to_tensor( + self.linear_bias, stop_gradient=False) + + self.tensor_x = paddle.to_tensor(self.x, stop_gradient=False) + self.tensor_residual = paddle.to_tensor( + self.residual, stop_gradient=False) + + def GetBaselineOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + if self.tensor_linear_bias is not None: + out = self.tensor_x + self.tensor_linear_bias + else: + out = self.tensor_x + + residual_out = self.tensor_residual + self.dropout(out) + final_out = self.norm1(residual_out) + + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True) + + if self.tensor_linear_bias is not None: + tensor_linear_bias_grad = self.tensor_linear_bias.grad + else: + tensor_linear_bias_grad = None + return final_out, self.tensor_x.grad, self.tensor_residual.grad, tensor_linear_bias_grad + + def GetFusedBiasDropoutResidualLayerNormOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + ln_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False) + ln_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False) + epsilon = 1e-05 + + final_out = incubate_f.fused_bias_dropout_residual_layer_norm( + 
self.tensor_x, self.tensor_residual, self.tensor_linear_bias, + ln_scale, ln_bias, self.dropout_prob, epsilon) + + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True) + if self.tensor_linear_bias is not None: + tensor_linear_bias_grad = self.tensor_linear_bias.grad + else: + tensor_linear_bias_grad = None + return final_out, self.tensor_x.grad, self.tensor_residual.grad, tensor_linear_bias_grad + + def test_fused_op(self): + out_ref, x_grad_ref, residual_grad_ref, linear_bias_grad_ref = self.GetBaselineOut( + ) + out, x_grad, residual_grad, linear_bias_grad = self.GetFusedBiasDropoutResidualLayerNormOut( + ) + np.testing.assert_allclose( + out_ref, out.numpy(), rtol=1e-5, atol=self.atol) + np.testing.assert_allclose( + x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=self.atol) + np.testing.assert_allclose( + residual_grad_ref, residual_grad.numpy(), rtol=1e-5, atol=self.atol) + if linear_bias_grad_ref is not None: + np.testing.assert_allclose( + linear_bias_grad_ref, + linear_bias_grad.numpy(), + rtol=1e-5, + atol=self.atol) + + +class TestFusedBiasDropoutResidualLayerNormOpBiasIsNone( + TestFusedBiasDropoutResidualLayerNormOp): + def config(self): + super().config() + self.bias_attr = False + + +class TestFusedBiasDropoutResidualLayerNormOpFp16( + TestFusedBiasDropoutResidualLayerNormOp): + def config(self): + super().config() + self.x_type = np.float16 + self.atol = 1e-1 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py new file mode 100644 index 0000000000000..19fc3972e58d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +from paddle.incubate.nn.layer.fused_transformer import FusedBiasDropoutResidualLayerNorm +from paddle import tensor +from paddle.fluid import layers +from paddle.static import Program, program_guard +import unittest + + +def layer_norm(x, has_scale, has_bias, weight, bias, epsilon=1e-05): + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = x1 + if (has_scale): + x_scaled = weight * x1 + x_scaled_bias = x_scaled + if (has_bias): + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + + +def compute_reference(x, residual, ln_scale, ln_bias, linear_bias): + batch_size = x.shape[0] + seq_len = x.shape[1] + embed_dim = x.shape[2] + + has_bias = True + if ln_bias is None: + has_bias = False + # bias add, dropout, residual add, layer_norm. + if linear_bias is not None: + linear_bias_out = x + linear_bias + else: + linear_bias_out = x + linear_bias_dropout_out = linear_bias_out + linear_bias_dropout_residual_out = residual + linear_bias_dropout_out + linear_bias_dropout_residual_ln_out = layer_norm( + linear_bias_dropout_residual_out, True, has_bias, ln_scale, ln_bias) + return linear_bias_dropout_residual_ln_out + + +class TestFusedBiasDropoutResidualLayerNormAPI(unittest.TestCase): + def setUp(self): + self.setXType() + self.setBiasAttr() + self.config() + self.generate_input_data() + + def setBiasAttr(self): + self.bias_attr = None + + def setXType(self): + self.x_type = np.float32 + self.atol = 1e-4 + + def config(self): + self.training = True + self.batch_size = 1 + self.query_length = 2 + self.embed_dim = 4 + self.dropout_prob = 0.0 + self.weight_attr = None + + def generate_input_data(self): + self.x = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.residual = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + + def run_imperative(self): + fused_bias_dropout_residual_ln = FusedBiasDropoutResidualLayerNorm( + self.embed_dim, self.dropout_prob, self.weight_attr, self.bias_attr) + + linear_bias = None + if self.bias_attr is not False: + linear_bias = np.random.random(fused_bias_dropout_residual_ln. 
+ linear_bias.shape).astype('float32') + fused_bias_dropout_residual_ln.linear_bias.set_value( + paddle.to_tensor(linear_bias)) + out = fused_bias_dropout_residual_ln( + paddle.to_tensor(self.x), paddle.to_tensor(self.residual)) + + ln_bias = None + if self.bias_attr is not False: + ln_bias = fused_bias_dropout_residual_ln.ln_bias.numpy() + ln_scale = fused_bias_dropout_residual_ln.ln_scale.numpy(), + ref_out = compute_reference(self.x, self.residual, ln_scale, ln_bias, + linear_bias) + np.testing.assert_allclose( + ref_out, out.numpy(), rtol=1e-5, atol=self.atol) + + def run_static(self): + fused_op = FusedBiasDropoutResidualLayerNorm( + self.embed_dim, self.dropout_prob, self.weight_attr, self.bias_attr) + + x = paddle.static.data( + name='X', + shape=[self.batch_size, self.query_length, self.embed_dim], + dtype=self.x_type) + residual = paddle.static.data( + name='Residual', + shape=[self.batch_size, self.query_length, self.embed_dim], + dtype=self.x_type) + final_out = fused_op(x, residual) + + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + linear_bias = None + ln_bias = None + if self.bias_attr is False: + out, ln_scale = exe.run( + paddle.static.default_main_program(), + feed={"X": self.x, + "Residual": self.residual}, + fetch_list=[final_out, fused_op.ln_scale]) + else: + out, linear_bias, ln_scale, ln_bias = exe.run( + paddle.static.default_main_program(), + feed={"X": self.x, + "Residual": self.residual}, + fetch_list=[ + final_out, fused_op.linear_bias, fused_op.ln_scale, + fused_op.ln_bias + ]) + return out, linear_bias, ln_scale, ln_bias + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(Program()): + out, linear_bias, ln_scale, ln_bias = self.run_static() + ref_out = compute_reference(self.x, self.residual, ln_scale, ln_bias, + linear_bias) + np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=self.atol) + + def test_dynamic_api(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + + +class TestFusedBiasDropoutResidualLayerNormAPIBiasIsNone( + TestFusedBiasDropoutResidualLayerNormAPI): + def setBiasAttr(self): + self.bias_attr = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index 8c68eb243aea8..25336efd6a7fb 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -40,7 +40,12 @@ def getShape(self): def getDiff(self): self.rtol = 1e-3 - self.atol = 1e-4 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 def getActivation(self): self.act_method = "gelu" diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py new file mode 100644 index 0000000000000..6f9ba5f5e4e57 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -0,0 +1,252 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +from paddle import tensor +import unittest +from op_test import OpTest, convert_float_to_uint16 +from test_sparse_attention_op import get_cuda_version +from paddle import _C_ops +from paddle.fluid.framework import default_main_program +from paddle.fluid import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Paddle is not compiled with CUDA") +class TestFusedGateAttentionOp(OpTest): + def setUp(self): + self.__class__.op_type = "fused_gate_attention" + # use autograd to check grad in this unittest. + self.__class__.no_need_check_grad = True + self.config() + self.merge_qkv = self.q_dim == self.kv_dim + self.generate_input_data() + + def config(self): + self.dtype = "float32" + self.has_gating = True + self.batch_size = 1 + self.msa_len = 3 + self.res_len = 5 + self.q_dim = 6 + self.num_heads = 2 + self.key_dim = 4 + self.m_size = self.res_len + self.kv_dim = self.q_dim + self.out_dim = self.q_dim + self.bias_attr = True + + def generate_input_data(self): + def _random(shape): + if self.dtype == "bfloat16": + data = np.random.random(shape).astype("float32") + return convert_float_to_uint16(data) + else: + return np.random.random(shape).astype(self.dtype) + + np.random.seed(123) + self.query = _random( + (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + self.q_weight = _random((self.q_dim, self.num_heads, self.key_dim)) + self.k_weight = _random((self.kv_dim, self.num_heads, self.key_dim)) + self.v_weight = _random((self.kv_dim, self.num_heads, self.key_dim)) + if self.merge_qkv: + self.key = None + # (3, self.num_heads, self.key_dim, self.q_dim) + q_weight_t = np.transpose(self.q_weight, axes=[1, 2, 0]) + k_weight_t = np.transpose(self.k_weight, axes=[1, 2, 0]) + v_weight_t = np.transpose(self.v_weight, axes=[1, 2, 0]) + self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t]) + else: + self.key = _random( + (self.batch_size, self.msa_len, self.m_size, self.kv_dim)) + self.qkv_weight = None + + self.attn_mask = _random( + (self.batch_size, self.msa_len, 1, 1, self.m_size)) + + if self.bias_attr: + self.nonbatched_bias = _random( + (self.batch_size, 1, self.num_heads, self.res_len, self.m_size)) + + if self.has_gating: + self.gating_w = _random((self.q_dim, self.num_heads, self.key_dim)) + self.gating_b = _random((self.num_heads, self.key_dim)) + + self.output_w = _random((self.num_heads, self.key_dim, self.out_dim)) + self.output_b = _random((self.out_dim)) + + self.dout = _random( + (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + + def get_reference_out(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + query = paddle.to_tensor(self.query, stop_gradient=False) + key = query if self.merge_qkv else paddle.to_tensor( + self.key, stop_gradient=False) + q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) + k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) + v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) + src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + + c = self.key_dim**(-0.5) + # [batch_size, msa_len, 
num_heads, res_len, key_dim] + q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c + # [batch_size, msa_len, num_heads, m_size, key_dim] + k = paddle.einsum('nbka,ahc->nbkhc', key, k_weight) + # [batch_size, msa_len, num_heads, m_size, key_dim] + v = paddle.einsum('nbka,ahc->nbkhc', key, v_weight) + + # [batch_size, msa_len, num_heads, res_len, m_size] + logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k) # qk_out + logits = logits + src_mask + if self.bias_attr: + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False) + logits = logits + nonbatched_bias + + weights = nn.functional.softmax(logits) # softmax_out + weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) + + if self.has_gating: + gating_w = paddle.to_tensor(self.gating_w, stop_gradient=False) + gating_b = paddle.to_tensor(self.gating_b, stop_gradient=False) + gate_values = paddle.einsum('nbqc,chv->nbqhv', query, + gating_w) + gating_b + gate_values = nn.functional.sigmoid(gate_values) + weighted_avg = weighted_avg * gate_values + + output_b = paddle.to_tensor(self.output_b, stop_gradient=False) + output_w = paddle.to_tensor(self.output_w, stop_gradient=False) + + out = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, + output_w) + output_b + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True) + if self.merge_qkv: + return out, query.grad, None + else: + return out, query.grad, key.grad + + def get_fused_gate_attention_out(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + query = paddle.to_tensor(self.query, stop_gradient=False) + if self.merge_qkv: + key = None + q_weight = None + k_weight = None + v_weight = None + qkv_weight = paddle.to_tensor(self.qkv_weight, stop_gradient=False) + else: + key = paddle.to_tensor(self.key, stop_gradient=False) + q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) + k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) + v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) + qkv_weight = None + + src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + + if self.bias_attr: + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False) + else: + nonbatched_bias = None + if self.has_gating: + gating_w = paddle.to_tensor(self.gating_w, stop_gradient=False) + gating_b = paddle.to_tensor(self.gating_b, stop_gradient=False) + else: + gating_w = None + gating_b = None + + output_w = paddle.to_tensor(self.output_w, stop_gradient=False) + output_b = paddle.to_tensor(self.output_b, stop_gradient=False) + + _, _, _, _, _, _, _, out = _C_ops.fused_gate_attention( + query, key, q_weight, k_weight, v_weight, qkv_weight, + nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b, + 'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) + + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True) + if key is not None: + return out, query.grad, key.grad + else: + return out, query.grad, None + + def check_output_and_grad(self, atol, rtol): + out_ref, query_grad_ref, key_grad_ref = self.get_reference_out() + out, query_grad, key_grad = self.get_fused_gate_attention_out() + np.testing.assert_allclose(out_ref, out.numpy(), atol=atol, rtol=rtol) + np.testing.assert_allclose( + query_grad_ref, query_grad.numpy(), atol=atol, rtol=rtol) + if key_grad_ref is not None and key_grad is not None: + np.testing.assert_allclose( + key_grad_ref, key_grad.numpy(), atol=atol, rtol=rtol) + + def test_output_and_grad(self): + 
self.check_output_and_grad(atol=1e-5, rtol=1e-5) + + +class TestSeparatedQKVCase(TestFusedGateAttentionOp): + def config(self): + self.dtype = "float32" + self.has_gating = False + self.batch_size = 1 + self.msa_len = 3 + self.res_len = 5 + self.q_dim = 6 + self.num_heads = 2 + self.key_dim = 4 + self.m_size = 4 + self.kv_dim = 2 + self.out_dim = self.q_dim + self.bias_attr = False + + +class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): + def config(self): + super().config() + self.has_gating = False + self.bias_attr = False + + +class TestMergeQKVFp16Case(TestFusedGateAttentionOp): + def config(self): + super().config() + self.dtype = "float16" + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-1, rtol=1e-5) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11000, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" +) +class TestMergeQKVBF16Case(TestFusedGateAttentionOp): + def config(self): + super().config() + self.dtype = "bfloat16" + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-1, rtol=1e-3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py index 2ea1bf2e9cb81..106ce5b4ef055 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py @@ -235,5 +235,6 @@ def init_dtype_type(self): if __name__ == "__main__": + paddle.enable_static() np.random.seed(0) unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py index f826898f9e5dd..4256945a1e8d5 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py @@ -19,7 +19,7 @@ import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, skip_check_inplace_ci def gelu(x): @@ -43,10 +43,15 @@ def get_output(X, Y, bias, act): return out +@skip_check_inplace_ci(reason="no inplace op") +class TestFuseGemmBase(OpTest): + pass + + @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpReluMMFP16(OpTest): +class TestFuseGemmEpilogueOpReluMMFP16(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -95,7 +100,7 @@ def init_dtype_type(self): @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpReluMTMFP16(OpTest): +class TestFuseGemmEpilogueOpReluMTMFP16(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -144,7 +149,7 @@ def init_dtype_type(self): @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpReluMMTFP16(OpTest): +class TestFuseGemmEpilogueOpReluMMTFP16(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -193,7 +198,7 @@ def init_dtype_type(self): @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not 
core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpReluMTMTFP16(OpTest): +class TestFuseGemmEpilogueOpReluMTMTFP16(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -242,7 +247,7 @@ def init_dtype_type(self): @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(OpTest): +class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -294,7 +299,7 @@ def init_dtype_type(self): @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(OpTest): +class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -346,7 +351,7 @@ def init_dtype_type(self): @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpGeluMMFP16(OpTest): +class TestFuseGemmEpilogueOpGeluMMFP16(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -397,7 +402,7 @@ def init_dtype_type(self): @skip_check_grad_ci(reason="no grap op") @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFuseGemmEpilogueOpNoneMMFP16(OpTest): +class TestFuseGemmEpilogueOpNoneMMFP16(TestFuseGemmBase): def setUp(self): self.op_type = "fused_gemm_epilogue" self.place = core.CUDAPlace(0) @@ -446,5 +451,6 @@ def init_dtype_type(self): if __name__ == "__main__": + paddle.enable_static() np.random.seed(0) unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py new file mode 100644 index 0000000000000..98548c9996588 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py @@ -0,0 +1,162 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
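The matmul_grad helper in the new test file below encodes the standard backward identities for z = op(x) @ op(y) + bias. As a reminder, here is a minimal NumPy sketch of the plain (trans_x=False, trans_y=False) case; the shapes and seed are illustrative only and not taken from the test.

import numpy as np

# Forward: z = x @ y + bias, with x:(m, k), y:(k, n), bias:(n,).
rng = np.random.default_rng(0)
m, k, n = 3, 4, 5
x = rng.standard_normal((m, k))
y = rng.standard_normal((k, n))
dz = rng.standard_normal((m, n))   # upstream gradient w.r.t. z

dx = dz @ y.T                      # gradient w.r.t. x, shape (m, k)
dy = x.T @ dz                      # gradient w.r.t. y, shape (k, n)
dbias = dz.sum(axis=0)             # bias broadcasts over rows, so its grad sums over them

assert dx.shape == x.shape and dy.shape == y.shape and dbias.shape == (n,)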
+ +import paddle +import paddle.fluid.core as core +import unittest +import numpy as np +from paddle.incubate.nn.functional import fused_matmul_bias, fused_linear +from paddle.incubate.nn import FusedLinear + + +def is_fused_matmul_bias_supported(): + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): + return hasattr(core.ops, 'fused_gemm_epilogue') + else: + return False + + +def matmul(x, y, bias, trans_x, trans_y): + x = np.array(x) + if trans_x: + x = np.ascontiguousarray(np.transpose(x)) + if trans_y: + y = np.ascontiguousarray(np.transpose(y)) + z = np.matmul(x, y) + if bias is None: + return z + else: + return z + bias + + +def matmul_grad(x, y, bias, dz, trans_x, trans_y): + if trans_x: + if trans_y: + dx = matmul(y, dz, None, True, True) + dy = matmul(dz, x, None, True, True) + else: + dx = matmul(y, dz, None, False, True) + dy = matmul(x, dz, None, False, False) + else: + if trans_y: + dx = matmul(dz, y, None, False, False) + dy = matmul(dz, x, None, True, False) + else: + dx = matmul(dz, y, None, False, True) + dy = matmul(x, dz, None, True, False) + if bias is None: + dbias = None + else: + dbias = np.sum(dz, axis=0, keepdims=False) + return dx, dy, dbias + + +@unittest.skipIf( + not is_fused_matmul_bias_supported(), + "fused_gemm_epilogue is only supported when CUDA version >= 11.6") +class TestFusedMatmulBias(unittest.TestCase): + def setUp(self): + paddle.set_device('gpu') + + def rand_data(self, shape, dtype): + return np.random.randint(low=-20, high=20, size=shape).astype(dtype) + + def rand_test_base(self, m, n, k, trans_x, trans_y, need_bias, dtype, seed): + np.random.seed(seed) + x_shape = [k, m] if trans_x else [m, k] + y_shape = [n, k] if trans_y else [k, n] + bias_shape = [n] + + x_np = self.rand_data(x_shape, dtype) + x = paddle.to_tensor(x_np) + x.stop_gradient = False + + y_np = self.rand_data(y_shape, dtype) + y = paddle.to_tensor(y_np) + y.stop_gradient = False + + if need_bias: + bias_np = self.rand_data(bias_shape, dtype) + bias = paddle.to_tensor(bias_np) + bias.stop_gradient = False + else: + bias_np = None + bias = None + + z = fused_matmul_bias(x, y, bias, trans_x, trans_y) + z_np = matmul(x_np, y_np, bias_np, trans_x, trans_y) + self.assertTrue(np.array_equal(z.numpy(), z_np)) + + z_grad_np = self.rand_data(z_np.shape, dtype) + paddle.autograd.backward(z, grad_tensors=[paddle.to_tensor(z_grad_np)]) + + x_grad_np, y_grad_np, bias_grad_np = matmul_grad( + x_np, y_np, bias_np, z_grad_np, trans_x, trans_y) + self.assertTrue(np.array_equal(x.grad.numpy(), x_grad_np)) + self.assertEqual(y_grad_np.shape, y_np.shape) + self.assertTrue(np.array_equal(y.grad.numpy(), y_grad_np)) + + if need_bias: + self.assertTrue(np.array_equal(bias.grad.numpy(), bias_grad_np)) + else: + self.assertTrue(bias_grad_np is None) + + def rand_test(self, m, n, k, dtype): + seed = int(np.random.randint(low=0, high=1000, size=[1])) + for trans_x in [False, True]: + for trans_y in [False, True]: + for need_bias in [False, True]: + self.rand_test_base(m, n, k, trans_x, trans_y, need_bias, + dtype, seed) + + def test_fp32(self): + self.rand_test(30, 40, 50, np.float32) + + def test_fp16(self): + self.rand_test(4, 5, 7, np.float16) + + +@unittest.skipIf( + not is_fused_matmul_bias_supported(), + "fused_gemm_epilogue is only supported when CUDA version >= 11.6") +class TestFusedLinear(unittest.TestCase): + def check_fused_linear(self, transpose): + x = paddle.randn([30, 40]) + linear = FusedLinear(40, 50, transpose_weight=transpose) + y1 = linear(x) + y2 = 
fused_linear(x, linear.weight, linear.bias, transpose) + self.assertTrue(np.array_equal(y1.numpy(), y2.numpy())) + + def test_non_transpose(self): + self.check_fused_linear(False) + + def test_transpose(self): + self.check_fused_linear(True) + + +@unittest.skipIf( + not is_fused_matmul_bias_supported(), + "fused_gemm_epilogue is only supported when CUDA version >= 11.6") +class TestStaticGraph(unittest.TestCase): + def test_static_graph(self): + paddle.enable_static() + x = paddle.static.data(name='x', dtype='float32', shape=[-1, 100]) + linear = FusedLinear(100, 300) + y = linear(x) + self.assertEqual(list(y.shape), [-1, 300]) + paddle.disable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py index 7dc86d0dea382..843b495e85b9a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py +++ b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py @@ -49,6 +49,14 @@ def setUp(self): self.setPreLayerNorm() self.setAttnMask() + self.rtol = 1e-3 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + def fused_weight(self, weight, num_head): a = paddle.transpose(weight, perm=[1, 0]) return paddle.reshape( @@ -151,13 +159,13 @@ def test_out(self): self.assertTrue(fused_encoder.fused_attn.extra_repr(), correct_attn_str) np.testing.assert_allclose( - fused_out.numpy(), base_out.numpy(), rtol=1e-3, atol=1e-4) + fused_out.numpy(), base_out.numpy(), rtol=self.rtol, atol=self.atol) self.assertTrue( np.allclose( fused_out.grad.numpy(), base_out.grad.numpy(), - rtol=1e-3, - atol=1e-4)) + rtol=self.rtol, + atol=self.atol)) class TestFusedTransformerEncoderLayerAct(TestFusedTransformerEncoderLayer): diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py index aa184dd42e6fc..23c514334765d 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py @@ -22,6 +22,7 @@ from op_test import OpTest from paddle.fluid import Program, program_guard from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import _test_eager_guard def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var): @@ -253,6 +254,10 @@ def test_norm(self): outputs = instance_norm(to_variable(inputs)) self.assertTrue(np.allclose(outputs.numpy(), out_np, atol=1e-6)) + def test_eager_api(self): + with _test_eager_guard(): + self.test_norm() + class TestElasticNormOpCase2(unittest.TestCase): def init_test_case(self): @@ -282,6 +287,10 @@ def test_norm(self): outputs = instance_norm(to_variable(inputs)) self.assertTrue(np.allclose(outputs.numpy(), out_np, atol=1e-6)) + def test_eager_api(self): + with _test_eager_guard(): + self.test_norm() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index 102e08e36a9e5..1656bc11869fd 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -22,6 
+22,7 @@ from paddle.fluid.framework import grad_var_name import paddle.fluid as fluid from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard import paddle @@ -116,6 +117,11 @@ def compute_v2(x_np): y2 = compute_v2(x) self.assertTrue(np.allclose(y1, y2)) + def test_eager_api(self): + with _test_eager_guard(): + self.test_dygraph() + self.test_error() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index cad6437d1d3e3..21844c9e402ad 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -48,6 +48,7 @@ def test_main(self): class TestLookupTableOp(OpTest): def setUp(self): self.op_type = "lookup_table_v2" + self.python_api = paddle.nn.functional.embedding table = np.random.random((17, 31)).astype("float64") ids = np.random.randint(0, 17, 4).astype(self.id_dtype()) self.inputs = {'W': table, 'Ids': ids} @@ -57,10 +58,10 @@ def id_dtype(self): return "int64" def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_eager=True) class TestLookupTableOpInt16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_nanmedian.py b/python/paddle/fluid/tests/unittests/test_nanmedian.py new file mode 100644 index 0000000000000..2e1f13a8c7d9f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nanmedian.py @@ -0,0 +1,196 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
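The test_check_grad case in the new file below builds its expected gradient by hand: when a row has an odd number of non-NaN entries the single middle element receives gradient 1.0, when the count is even the two middle elements receive 0.5 each, and NaN positions receive 0. A small NumPy illustration of that convention, on made-up values:

import numpy as np

row = np.array([np.nan, 3.0, 1.0, 2.0, np.nan])
valid = np.sort(row[~np.isnan(row)])        # [1., 2., 3.] -> 3 valid entries, odd count
mid = valid.size // 2
is_odd = valid.size % 2 == 1
targets = [valid[mid]] if is_odd else [valid[mid], valid[mid - 1]]
grad = np.where(np.isin(row, targets), 1.0 if is_odd else 0.5, 0.0)
print(grad)   # [0. 0. 0. 1. 0.] -> only the median entry (2.0) receives gradient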
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core + +np.random.seed(102) + + +class TestNanmedian(unittest.TestCase): + def setUp(self): + single_axis_shape = (120) + multi_axis_shape = (2, 3, 4, 5) + + self.fake_data = { + "single_axis_normal": + np.random.uniform(-1, 1, single_axis_shape).astype(np.float32), + "multi_axis_normal": + np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32), + "single_axis_all_nan": np.full(single_axis_shape, np.nan), + "multi_axis_all_nan": np.full(multi_axis_shape, np.nan), + } + + single_partial_nan = self.fake_data["single_axis_normal"].copy() + single_partial_nan[single_partial_nan > 0] = np.nan + multi_partial_nan = self.fake_data["multi_axis_normal"].copy() + multi_partial_nan[multi_partial_nan > 0] = np.nan + self.fake_data["single_axis_partial_nan"] = single_partial_nan + self.fake_data["multi_axis_partial_nan"] = multi_partial_nan + + row_data = np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32) + row_data[:, :, :, 0] = np.nan + row_data[:, :, :2, 1] = np.nan + row_data[:, :, 2:, 2] = np.nan + self.fake_data["row_nan_even"] = row_data + self.fake_data["row_nan_float64"] = row_data.astype(np.float64) + self.fake_data["row_nan_int64"] = row_data.astype(np.int64) + self.fake_data["row_nan_int32"] = row_data.astype(np.int32) + + col_data = np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32) + col_data[:, :, 0, :] = np.nan + col_data[:, :, 1, :3] = np.nan + col_data[:, :, 2, 3:] = np.nan + self.fake_data["col_nan_odd"] = col_data + + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + self.axis_candiate_list = [ + None, 0, 2, -1, -2, (1, 2), [0, -1], [0, 1, 3], (1, 2, 3), + [0, 2, 1, 3] + ] + + def test_api_static(self): + data = self.fake_data["col_nan_odd"] + paddle.enable_static() + np_res = np.nanmedian(data, keepdims=True) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', data.shape) + out1 = paddle.nanmedian(x, keepdim=True) + out2 = paddle.tensor.nanmedian(x, keepdim=True) + out3 = paddle.tensor.stat.nanmedian(x, keepdim=True) + axis = np.arange(len(data.shape)).tolist() + out4 = paddle.nanmedian(x, axis=axis, keepdim=True) + out5 = paddle.nanmedian(x, axis=tuple(axis), keepdim=True) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': data}, + fetch_list=[out1, out2, out3, out4, out5]) + + for out in res: + self.assertTrue(np.allclose(np_res, out, equal_nan=True)) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def clean_axis_numpy(axis, shape_len): + if isinstance(axis, tuple): + axis = list(axis) + if isinstance(axis, list): + for k in range(len(axis)): + if axis[k] < 0: + axis[k] += shape_len + axis = set(axis) + return axis + + def test_data_case(data): + for keep_dim in [False, True]: + if np.isnan(data).all() and keep_dim: + np_ver = np.version.version.split('.') + if int(np_ver[0]) < 1 or int(np_ver[1]) <= 20: + print( + "This numpy version does not support all nan elements when keepdim is True" + ) + continue + + np_res = np.nanmedian(data, keepdims=keep_dim) + pd_res = paddle.nanmedian( + paddle.to_tensor(data), keepdim=keep_dim) + self.assertTrue( + np.allclose( + np_res, pd_res.numpy(), equal_nan=True)) + + def test_axis_case(data, axis): + pd_res = paddle.nanmedian( + paddle.to_tensor(data), axis=axis, keepdim=False) + axis = clean_axis_numpy(axis, len(data.shape)) + np_res = np.nanmedian(data, axis=axis, 
keepdims=False) + self.assertTrue(np.allclose(np_res, pd_res.numpy(), equal_nan=True)) + + for name, data in self.fake_data.items(): + test_data_case(data) + + for axis in self.axis_candiate_list: + test_axis_case(self.fake_data["row_nan_even"], axis) + test_axis_case(self.fake_data["col_nan_odd"], axis) + + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data("X", [10, 12]) + + def test_dtype(): + x2 = paddle.fluid.data('X2', [10, 12], 'bool') + paddle.nanmedian(x2) + + def test_empty_axis(): + paddle.nanmedian(x, axis=[], keepdim=True) + + def test_axis_not_in_range(): + paddle.nanmedian(x, axis=3, keepdim=True) + + def test_duplicated_axis(): + paddle.nanmedian(x, axis=[1, -1], keepdim=True) + + self.assertRaises(TypeError, test_dtype) + self.assertRaises(ValueError, test_empty_axis) + self.assertRaises(ValueError, test_axis_not_in_range) + self.assertRaises(ValueError, test_duplicated_axis) + + def test_dygraph(self): + paddle.disable_static(place=self.place) + with paddle.fluid.dygraph.guard(): + data = self.fake_data["col_nan_odd"] + out = paddle.nanmedian(paddle.to_tensor(data), keepdim=True) + np_res = np.nanmedian(data, keepdims=True) + self.assertTrue(np.allclose(np_res, out, equal_nan=True)) + paddle.enable_static() + + def test_check_grad(self): + paddle.disable_static(place=self.place) + shape = (4, 5) + x_np = np.random.uniform(-1, 1, shape).astype(np.float64) + x_np[0, :] = np.nan + x_np[1, :3] = np.nan + x_np[2, 3:] = np.nan + x_np_sorted = np.sort(x_np) + nan_counts = np.count_nonzero(np.isnan(x_np).astype(np.int32), axis=1) + np_grad = np.zeros((shape)) + for i in range(shape[0]): + valid_cnts = shape[1] - nan_counts[i] + if valid_cnts == 0: + continue + + mid = int(valid_cnts / 2) + targets = [x_np_sorted[i, mid]] + is_odd = valid_cnts % 2 + if not is_odd and mid > 0: + targets.append(x_np_sorted[i, mid - 1]) + for j in range(shape[1]): + if x_np[i, j] in targets: + np_grad[i, j] = 1 if is_odd else 0.5 + + x_tensor = paddle.to_tensor(x_np, stop_gradient=False) + y = paddle.nanmedian(x_tensor, axis=1, keepdim=True) + dx = paddle.grad(y, x_tensor)[0].numpy() + self.assertTrue(np.allclose(np_grad, dx, equal_nan=True)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index 1452b869d4f8b..13c2edbf37cf7 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -70,6 +70,72 @@ def func(self, place): [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) +class TestInstanceNormDoubleGradEagerCheck(unittest.TestCase): + def instance_norm_wrapper(self, x): + return paddle.nn.functional.instance_norm(x[0]) + + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + np.random.seed() + shape = [2, 3, 4, 5] + dtype = "float32" + eps = 0.005 + atol = 1e-4 + x = layers.create_parameter(dtype=dtype, shape=shape, name='x') + z = paddle.nn.functional.instance_norm(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + # check for static mode + gradient_checker.double_grad_check( + [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) + # check for eager mode + gradient_checker.double_grad_check_for_dygraph( + self.instance_norm_wrapper, [x], + z, + x_init=x_arr, + atol=atol, + place=place) + + def test_grad(self): + 
paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestInstanceNormDoubleGradEagerCheckWithParams( + TestInstanceNormDoubleGradEagerCheck): + def instance_norm_wrapper(self, x): + instance_norm = paddle.nn.InstanceNorm2D(3) + return instance_norm(x[0]) + + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + np.random.seed() + shape = [2, 3, 4, 5] + dtype = "float32" + eps = 0.005 + atol = 1e-4 + x = layers.create_parameter(dtype=dtype, shape=shape, name='x') + z = paddle.nn.InstanceNorm2D(3)(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + # check for static mode + gradient_checker.double_grad_check( + [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) + # check for eager mode + gradient_checker.double_grad_check_for_dygraph( + self.instance_norm_wrapper, [x], + z, + x_init=x_arr, + atol=atol, + place=place) + + class TestBatchNormDoubleGradCheck(unittest.TestCase): def setUp(self): self.init_test() diff --git a/python/paddle/fluid/tests/unittests/test_rrelu_op.py b/python/paddle/fluid/tests/unittests/test_rrelu_op.py new file mode 100644 index 0000000000000..9d33ce085b7f7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rrelu_op.py @@ -0,0 +1,326 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
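The helpers defined in the new RReLU test file below check two behaviours: the inference path is deterministic with a fixed negative slope of (lower + upper) / 2, while in training each negative element is scaled by a random slope drawn from [lower, upper], so the output can only be bounded rather than matched exactly. A short NumPy sketch of both checks, using made-up values:

import numpy as np

x = np.array([-2.0, -0.5, 0.0, 1.5])
lower, upper = 0.1, 0.3

eval_out = np.where(x <= 0, (lower + upper) / 2.0 * x, x)  # deterministic inference output
lo_bound = np.where(x <= 0, upper * x, x)                  # most negative value training may produce
hi_bound = np.where(x <= 0, lower * x, x)                  # least negative value training may produce
assert np.all(lo_bound <= eval_out) and np.all(eval_out <= hi_bound)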
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.nn.functional as F +from paddle.fluid import dygraph + +paddle.seed(102) +np.random.seed(102) + + +def ref_rrelu(x, lower, upper): + x_t = x.copy() + alpha = (lower + upper) / 2.0 + return np.where(x_t <= 0, alpha * x_t, x_t) + + +def ref_rrelu_nn(x, lower, upper): + return ref_rrelu(x, lower, upper) + + +def check_output(input, output, lower, upper): + lower_res = np.where(input <= 0, lower * input, input) + upper_res = np.where(input <= 0, upper * input, input) + return (output <= lower_res).all() and (output >= upper_res).all() + + +class TestFunctionalRReluAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float64') + self.lower_0 = 0.05 + self.lower_1 = 0.1 + self.upper_0 = 0.25 + self.upper_1 = 0.33 + + self.places = [ + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() else fluid.CPUPlace() + ] + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 4, 5], dtype="float32") + res1 = F.rrelu( + x=input, lower=self.lower_0, upper=self.upper_0, training=False) + res2 = F.rrelu( + x=input, lower=self.lower_1, upper=self.upper_1, training=False) + in_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype("float32") + + res_np1 = ref_rrelu(in_np, self.lower_0, self.upper_0) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res1]) + + self.assertTrue(np.allclose(fetches[0], res_np1)) + + res_np2 = ref_rrelu(in_np, self.lower_1, self.upper_1) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res2]) + self.assertTrue(np.allclose(fetches[0], res_np2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_static_graph_functional(self): + '''test_static_graph_functional''' + + for place in self.places: + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=self.x_np.shape, dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=self.x_np.shape, dtype="float64") + out_1 = F.rrelu(x_1, self.lower_0, self.upper_0, training=False) + out_2 = F.rrelu(x_2, self.lower_1, self.upper_1, training=False) + out_3 = F.rrelu(x_2, self.lower_1, self.upper_1, training=True) + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=out_1, + use_prune=True) + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_2, + use_prune=True) + res_3 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_3, + use_prune=True) + + out_ref_1 = ref_rrelu(self.x_np, self.lower_0, self.upper_0) + out_ref_2 = ref_rrelu(self.x_np, self.lower_1, self.upper_1) + self.assertEqual(np.allclose(out_ref_1, res_1), True) + self.assertEqual(np.allclose(out_ref_2, res_2), True) + self.assertTrue( + check_output(self.x_np, res_3[0], self.lower_1, self.upper_1)) + + def test_static_graph_layer(self): + '''test_static_graph_layer''' + + for place in self.places: + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=self.x_np.shape, dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=self.x_np.shape, dtype="float64") + # init instance + rrelu_1 = 
paddle.nn.RReLU(self.lower_0, self.upper_0) + rrelu_2 = paddle.nn.RReLU(self.lower_1, self.upper_1) + out_1 = rrelu_1(x_1) + out_2 = rrelu_2(x_2) + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=out_1, + use_prune=True) + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_2, + use_prune=True) + + self.assertTrue( + check_output(self.x_np, res_1[0], self.lower_0, self.upper_0)) + self.assertTrue( + check_output(self.x_np, res_2[0], self.lower_1, self.upper_1)) + + def dygraph_check(self, lower, upper): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self.x_np) + out = F.rrelu(x, lower, upper, training=False) + out_ref = ref_rrelu(self.x_np, lower, upper) + self.assertEqual(np.allclose(out_ref, out), True) + paddle.enable_static() + + def test_dygraph_functional(self): + '''test_dygraph_functional''' + + self.dygraph_check(self.lower_0, self.upper_0) + self.dygraph_check(self.lower_1, self.upper_1) + + def test_dygraph_layer(self): + '''test_dygraph_layer''' + + for place in self.places: + paddle.disable_static(place=place) + rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0) + result = rrelu(paddle.to_tensor(self.x_np)) + self.assertTrue( + check_output(self.x_np, + result.numpy(), self.lower_0, self.upper_0)) + paddle.enable_static() + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place=place) + with dygraph.guard(): + rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0) + out_np = rrelu(paddle.to_tensor(self.x_np)) + self.assertTrue( + check_output(self.x_np, + out_np.numpy(), self.lower_0, self.upper_0)) + paddle.enable_static() + + def test_error_functional(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises( + TypeError, F.rrelu, x=1, lower=self.lower_0, upper=self.upper_0) + # The input dtype must be float16, float32, float64. + x_int32 = paddle.fluid.data( + name='x_int32', shape=[2, 3], dtype='int32') + self.assertRaises( + TypeError, + F.rrelu, + x=x_int32, + lower=self.lower_0, + upper=self.upper_0) + x_bool = paddle.fluid.data( + name='x_bool', shape=[2, 3], dtype='int32') + self.assertRaises( + TypeError, + F.rrelu, + x=x_bool, + lower=self.lower_0, + upper=self.upper_0) + # lower and upper must be float + x_fp32 = paddle.fluid.data( + name='x_fp32', shape=[2, 3], dtype='float32') + self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0, upper=0.5) + self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0.5, upper=1) + # lower and upper must be in (0, 1) + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=-1., upper=0.5) + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=2.) 
+ # upper should not be less than lower + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=0.2) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[2, 3], dtype='float16') + F.rrelu(x=x_fp16, lower=self.lower_0, upper=self.upper_0) + + def test_error_layer(self): + def error_int_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float64") + rrelu = paddle.nn.RReLU(2, 3) + rrelu(paddle.to_tensor(x)) + + def error_lower_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0, 0.5) + rrelu(paddle.to_tensor(x)) + + def error_upper_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 1) + rrelu(paddle.to_tensor(x)) + + def error_lower_range(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(-1.0, 0.5) + rrelu(paddle.to_tensor(x)) + + def error_upper_range(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 2.0) + rrelu(paddle.to_tensor(x)) + + def error_lower_upper(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 0.2) + rrelu(paddle.to_tensor(x)) + + self.assertRaises(TypeError, error_int_dtype) + self.assertRaises(TypeError, error_lower_dtype) + self.assertRaises(TypeError, error_upper_dtype) + self.assertRaises(ValueError, error_lower_range) + self.assertRaises(ValueError, error_upper_range) + self.assertRaises(ValueError, error_lower_upper) + + +class RReluTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.1 + self.upper = 0.3 + self.is_test = True + self.init_prams() + + def init_prams(self): + self.dtype = "float64" + self.x_shape = [2, 3, 4, 5] + + x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) + out_np = ref_rrelu(x_np, self.lower, self.upper) + noise_np = np.ones(self.x_shape).astype(self.dtype) + noise_np[x_np < 0] = (self.lower + self.upper) / 2.0 + + self.inputs = {'X': x_np} + self.outputs = {'Out': out_np, 'Noise': noise_np} + self.attrs = { + 'lower': self.lower, + "upper": self.upper, + "is_test": self.is_test + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class RReluTrainingTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.3 + self.upper = 0.3000009 + self.is_test = False + self.init_prams() + + +class RReluTrainingTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.3 + self.upper = 0.3000009 + self.is_test = False + self.init_prams() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index 1677051ee9db4..5634490aa3e75 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -41,7 +41,7 @@ def test_conv3d(self): correct_out_values = [[5], [11]] sparse_input = core.eager.sparse_coo_tensor(indices, values, dense_shape, False) - out = paddle.sparse.functional.conv3d( + out = paddle.incubate.sparse.nn.functional.conv3d( sparse_input, dense_kernel, bias=paddle.to_tensor( @@ -61,10 +61,11 @@ def test_subm_conv3d(self): indices = paddle.to_tensor(indices, dtype='int32') values = 
paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values, dense_shape, stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') - y = paddle.sparse.functional.subm_conv3d(sparse_x, weight) + y = paddle.incubate.sparse.nn.functional.subm_conv3d(sparse_x, + weight) assert np.array_equal(sparse_x.indices().numpy(), y.indices().numpy()) @@ -78,16 +79,16 @@ def test_Conv3D(self): values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] correct_out_values = [[4], [10]] - sparse_input = paddle.sparse.sparse_coo_tensor(indices, values, - dense_shape, False) + sparse_input = paddle.incubate.sparse.sparse_coo_tensor( + indices, values, dense_shape, False) - sparse_conv3d = paddle.sparse.Conv3D( + sparse_conv3d = paddle.incubate.sparse.nn.Conv3D( 1, 1, (1, 3, 3), data_format='NDHWC') sparse_out = sparse_conv3d(sparse_input) #test errors with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' - conv3d = paddle.sparse.SubmConv3D( + conv3d = paddle.incubate.sparse.nn.SubmConv3D( 1, 1, (1, 3, 3), data_format='NCDHW') def test_SubmConv3D(self): @@ -98,10 +99,10 @@ def test_SubmConv3D(self): values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] correct_out_values = [[4], [10]] - sparse_input = paddle.sparse.sparse_coo_tensor(indices, values, - dense_shape, False) + sparse_input = paddle.incubate.sparse.sparse_coo_tensor( + indices, values, dense_shape, False) - subm_conv3d = paddle.sparse.SubmConv3D( + subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D( 1, 1, (1, 3, 3), data_format='NDHWC') # test extra_repr print(subm_conv3d.extra_repr()) @@ -113,5 +114,5 @@ def test_SubmConv3D(self): #test errors with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' - conv3d = paddle.sparse.SubmConv3D( + conv3d = paddle.incubate.sparse.nn.SubmConv3D( 1, 1, (1, 3, 3), data_format='NCDHW') diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py index 3c3085ec8be69..f22d48ae92b0d 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py @@ -38,7 +38,7 @@ def test(self): dense_x2 = copy.deepcopy(dense_x) dense_x2.stop_gradient = False sparse_x = dense_x2.to_sparse_coo(sparse_dim) - sparse_batch_norm = paddle.sparse.BatchNorm(channels) + sparse_batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels) # set same params sparse_batch_norm._mean.set_value(batch_norm._mean) sparse_batch_norm._variance.set_value(batch_norm._variance) @@ -66,7 +66,7 @@ def test_error_layout(self): shape = [2, 3, 6, 6, 3] x = paddle.randn(shape) sparse_x = x.to_sparse_coo(4) - sparse_batch_norm = paddle.sparse.BatchNorm( + sparse_batch_norm = paddle.incubate.sparse.nn.BatchNorm( 3, data_format='NCDHW') sparse_batch_norm(sparse_x) @@ -77,7 +77,7 @@ def test2(self): x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') dense_x = paddle.to_tensor(x_data) sparse_x = dense_x.to_sparse_coo(4) - batch_norm = paddle.sparse.BatchNorm(channels) + batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels) batch_norm_out = batch_norm(sparse_x) print(batch_norm_out.shape) # [1, 6, 6, 6, 3] diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py index 
8d65a4c4444d4..c0a43b3dad3b0 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py @@ -47,7 +47,7 @@ def test(self): self.setUp() self.dense_x.stop_gradient = False sparse_x = self.dense_x.to_sparse_coo(4) - sparse_out = paddle.sparse.functional.max_pool3d( + sparse_out = paddle.incubate.sparse.nn.functional.max_pool3d( sparse_x, self.kernel_sizes, stride=self.strides, @@ -104,7 +104,7 @@ def test(self): with _test_eager_guard(): dense_x = paddle.randn((2, 3, 6, 6, 3)) sparse_x = dense_x.to_sparse_coo(4) - max_pool3d = paddle.sparse.MaxPool3D( + max_pool3d = paddle.incubate.sparse.nn.MaxPool3D( kernel_size=3, data_format='NDHWC') out = max_pool3d(sparse_x) out = out.to_dense() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py index 573cc5ba8cf5d..f3337bce9117c 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -67,15 +67,15 @@ def test_sparse_relu(self): x, lambda x: x.to_sparse_coo(sparse_dim), paddle.nn.ReLU(), - paddle.sparse.ReLU(), + paddle.incubate.sparse.nn.ReLU(), True, ) self.compare_with_dense( x, lambda x: x.to_sparse_csr(), paddle.nn.ReLU(), - paddle.sparse.ReLU(), + paddle.incubate.sparse.nn.ReLU(), False, ) - self.assert_raises_on_dense_tensor(paddle.sparse.ReLU()) + self.assert_raises_on_dense_tensor(paddle.incubate.sparse.nn.ReLU()) def test_sparse_sqrt(self): x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] @@ -84,15 +84,15 @@ def test_sparse_sqrt(self): x, lambda x: x.to_sparse_coo(sparse_dim), paddle.sqrt, - paddle.sparse.sqrt, + paddle.incubate.sparse.sqrt, True, ) self.compare_with_dense( x, lambda x: x.to_sparse_csr(), paddle.sqrt, - paddle.sparse.sqrt, + paddle.incubate.sparse.sqrt, False, ) - self.assert_raises_on_dense_tensor(paddle.sparse.sqrt) + self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sqrt) def test_sparse_sin(self): x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] @@ -101,15 +101,15 @@ def test_sparse_sin(self): x, lambda x: x.to_sparse_coo(sparse_dim), paddle.sin, - paddle.sparse.sin, + paddle.incubate.sparse.sin, True, ) self.compare_with_dense( x, lambda x: x.to_sparse_csr(), paddle.sin, - paddle.sparse.sin, + paddle.incubate.sparse.sin, False, ) - self.assert_raises_on_dense_tensor(paddle.sparse.sin) + self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sin) def test_sparse_tanh(self): x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, -4, 2, 0]] @@ -118,15 +118,15 @@ def test_sparse_tanh(self): x, lambda x: x.to_sparse_coo(sparse_dim), paddle.tanh, - paddle.sparse.tanh, + paddle.incubate.sparse.tanh, True, ) self.compare_with_dense( x, lambda x: x.to_sparse_csr(), paddle.tanh, - paddle.sparse.tanh, + paddle.incubate.sparse.tanh, False, ) - self.assert_raises_on_dense_tensor(paddle.sparse.tanh) + self.assert_raises_on_dense_tensor(paddle.incubate.sparse.tanh) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 80820c0f2d837..5a5af059951e0 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -30,7 +30,7 @@ def test_create_coo_by_tensor(self): dense_shape = [3, 4] dense_indices = paddle.to_tensor(indices) dense_elements = paddle.to_tensor(values, dtype='float32') - coo = 
paddle.sparse.sparse_coo_tensor( + coo = paddle.incubate.sparse.sparse_coo_tensor( dense_indices, dense_elements, dense_shape, stop_gradient=False) # test the to_string.py print(coo) @@ -42,7 +42,8 @@ def test_create_coo_by_np(self): indices = [[0, 1, 2], [1, 2, 0]] values = [1.0, 2.0, 3.0] dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, + dense_shape) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -56,7 +57,7 @@ def test_create_csr_by_tensor(self): dense_cols = paddle.to_tensor(cols) dense_elements = paddle.to_tensor(values, dtype='float32') stop_gradient = False - csr = paddle.sparse.sparse_csr_tensor( + csr = paddle.incubate.sparse.sparse_csr_tensor( dense_crows, dense_cols, dense_elements, @@ -69,8 +70,8 @@ def test_create_csr_by_np(self): cols = [1, 3, 2, 0, 1] values = [1, 2, 3, 4, 5] dense_shape = [3, 4] - csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, - dense_shape) + csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, + dense_shape) # test the to_string.py print(csr) assert np.array_equal(crows, csr.crows().numpy()) @@ -83,7 +84,7 @@ def test_place(self): indices = [[0, 1], [0, 1]] values = [1.0, 2.0] dense_shape = [2, 2] - coo = paddle.sparse.sparse_coo_tensor( + coo = paddle.incubate.sparse.sparse_coo_tensor( indices, values, dense_shape, place=place) assert coo.place.is_cpu_place() assert coo.values().place.is_cpu_place() @@ -92,7 +93,7 @@ def test_place(self): crows = [0, 2, 3, 5] cols = [1, 3, 2, 0, 1] values = [1.0, 2.0, 3.0, 4.0, 5.0] - csr = paddle.sparse.sparse_csr_tensor( + csr = paddle.incubate.sparse.sparse_csr_tensor( crows, cols, values, [3, 5], place=place) assert csr.place.is_cpu_place() assert csr.crows().place.is_cpu_place() @@ -106,14 +107,14 @@ def test_dtype(self): dense_shape = [2, 2] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - coo = paddle.sparse.sparse_coo_tensor( + coo = paddle.incubate.sparse.sparse_coo_tensor( indices, values, dense_shape, dtype='float64') assert coo.dtype == paddle.float64 crows = [0, 2, 3, 5] cols = [1, 3, 2, 0, 1] values = [1.0, 2.0, 3.0, 4.0, 5.0] - csr = paddle.sparse.sparse_csr_tensor( + csr = paddle.incubate.sparse.sparse_csr_tensor( crows, cols, values, [3, 5], dtype='float16') assert csr.dtype == paddle.float16 @@ -123,7 +124,7 @@ def test_create_coo_no_shape(self): values = [1.0, 2.0] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - coo = paddle.sparse.sparse_coo_tensor(indices, values) + coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values) assert [2, 2] == coo.shape @@ -140,7 +141,7 @@ def test_to_sparse_coo(self): #test to_sparse_coo_grad backward out_grad_indices = [[0, 1], [0, 1]] out_grad_values = [2.0, 3.0] - out_grad = paddle.sparse.sparse_coo_tensor( + out_grad = paddle.incubate.sparse.sparse_coo_tensor( paddle.to_tensor(out_grad_indices), paddle.to_tensor(out_grad_values), shape=out.shape, @@ -153,7 +154,7 @@ def test_coo_to_dense(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( paddle.to_tensor(indices), paddle.to_tensor(values), shape=[3, 4], @@ -169,7 +170,7 @@ def test_coo_to_dense(self): sparse_x.grad.values().numpy()) 
paddle.device.set_device("cpu") - sparse_x_cpu = paddle.sparse.sparse_coo_tensor( + sparse_x_cpu = paddle.incubate.sparse.sparse_coo_tensor( paddle.to_tensor(indices), paddle.to_tensor(values), shape=[3, 4], @@ -198,7 +199,7 @@ def test_coo_values_grad(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( paddle.to_tensor(indices), paddle.to_tensor(values), shape=[3, 4], @@ -211,7 +212,7 @@ def test_coo_values_grad(self): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [5.0, 5.0]] - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( paddle.to_tensor(indices), paddle.to_tensor(values), shape=[3, 4, 2], @@ -234,13 +235,13 @@ def test_sparse_coo_tensor_grad(self): indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor( values, dtype='float32', stop_gradient=False) - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values, shape=[2, 2], stop_gradient=False) grad_indices = [[0, 1], [1, 1]] grad_values = [2, 3] grad_indices = paddle.to_tensor(grad_indices, dtype='int32') grad_values = paddle.to_tensor(grad_values, dtype='float32') - sparse_out_grad = paddle.sparse.sparse_coo_tensor( + sparse_out_grad = paddle.incubate.sparse.sparse_coo_tensor( grad_indices, grad_values, shape=[2, 2]) sparse_x.backward(sparse_out_grad) correct_values_grad = [0, 3] @@ -251,11 +252,11 @@ def test_sparse_coo_tensor_grad(self): values = [[1, 1], [2, 2]] values = paddle.to_tensor( values, dtype='float32', stop_gradient=False) - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values, shape=[2, 2, 2], stop_gradient=False) grad_values = [[2, 2], [3, 3]] grad_values = paddle.to_tensor(grad_values, dtype='float32') - sparse_out_grad = paddle.sparse.sparse_coo_tensor( + sparse_out_grad = paddle.incubate.sparse.sparse_coo_tensor( grad_indices, grad_values, shape=[2, 2, 2]) sparse_x.backward(sparse_out_grad) correct_values_grad = [[0, 0], [3, 3]] @@ -273,7 +274,8 @@ def test_sparse_coo_tensor_sorted(self): values = [1.0, 2.0, 3.0] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, + values) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, @@ -284,7 +286,8 @@ def test_sparse_coo_tensor_sorted(self): # test the non-zero values is a vector values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]] values = paddle.to_tensor(values, dtype='float32') - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, + values) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) @@ -300,7 +303,7 @@ def test_small_shape(self): values = [1, 2] # 1. the shape too small dense_shape = [2, 2] - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values, shape=dense_shape) def test_same_nnz(self): @@ -309,7 +312,8 @@ def test_same_nnz(self): # 2. 
test the nnz of indices must same as nnz of values indices = [[1, 2], [1, 0]] values = [1, 2, 3] - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, + values) def test_same_dimensions(self): with _test_eager_guard(): @@ -317,7 +321,7 @@ def test_same_dimensions(self): indices = [[1, 2], [1, 0]] values = [1, 2, 3] shape = [2, 3, 4] - sparse_x = paddle.sparse.sparse_coo_tensor( + sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values, shape=shape) def test_indices_dtype(self): @@ -325,7 +329,8 @@ def test_indices_dtype(self): with self.assertRaises(TypeError): indices = [[1.0, 2.0], [0, 1]] values = [1, 2] - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, + values) class TestCsrError(unittest.TestCase): @@ -336,8 +341,8 @@ def test_dimension1(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3] - sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, - shape) + sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + crows, cols, values, shape) def test_dimension2(self): with _test_eager_guard(): @@ -346,8 +351,8 @@ def test_dimension2(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3, 3, 3, 3] - sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, - shape) + sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + crows, cols, values, shape) def test_same_shape1(self): with _test_eager_guard(): @@ -356,8 +361,8 @@ def test_same_shape1(self): cols = [0, 1, 2, 3] values = [1, 2, 3] shape = [3, 4] - sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, - shape) + sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + crows, cols, values, shape) def test_same_shape2(self): with _test_eager_guard(): @@ -366,8 +371,8 @@ def test_same_shape2(self): cols = [0, 1, 2, 3] values = [1, 2, 3, 4] shape = [3, 4] - sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, - shape) + sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + crows, cols, values, shape) def test_same_shape3(self): with _test_eager_guard(): @@ -376,8 +381,8 @@ def test_same_shape3(self): cols = [0, 1, 2, 3, 0, 1, 2] values = [1, 2, 3, 4, 0, 1, 2] shape = [2, 3, 4] - sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, - shape) + sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + crows, cols, values, shape) def test_crows_first_value(self): with _test_eager_guard(): @@ -386,8 +391,8 @@ def test_crows_first_value(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3, 4] - sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, - shape) + sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + crows, cols, values, shape) def test_dtype(self): with _test_eager_guard(): @@ -396,8 +401,8 @@ def test_dtype(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3] - sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, - shape) + sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + crows, cols, values, shape) if __name__ == "__main__": diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index ffd1607fe87b4..a3584a73dfae1 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -55,5 +55,6 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from ..fluid.framework import in_dygraph_mode # noqa: F401 +from ..fluid.framework import _in_legacy_dygraph # noqa: F401 __all__ = [] diff --git a/python/paddle/incubate/__init__.py 
b/python/paddle/incubate/__init__.py index c354baf3b43b7..c030cf5bbb9ee 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -30,6 +30,7 @@ from .passes import fuse_resnet_unit_pass import paddle.incubate.autograd import paddle.incubate.autotune +import paddle.incubate.sparse from . import nn #noqa: F401 from . import asp #noqa: F401 diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 7a969748208a4..1f5c4f9a5cebb 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -38,8 +38,7 @@ def topo_path(xs, ys, block=None): path, the unused variables in `xs`, and the unreached variables in `ys` """ - if block is None: - block = default_main_program().current_block() + block = default_main_program().current_block() if block is None else block path = [] backpath = [] @@ -160,11 +159,14 @@ def contain_value(self, value_var): return id(value_var) in self.tab.values() +# TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. class Transform(object): """ An object that maintains the state of transformations applied to a primitve program. """ def __init__(self, block): + assert block == default_main_program().current_block( + ), f'only support transform on current block of main program.' self.block = block self.vars = self.init_vars(block) self.var2dot = VarMap('var2dot', self.vars) @@ -400,6 +402,7 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): return ys_bar, xs_bar +# TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. def _lower(block, reverse): # Some functions which are only used in _lower. def bind(args, to_bind, value_table): @@ -430,10 +433,6 @@ def expand_nested_list(xs): # Step1: Do some preparatory work for lower lower_fn = _prim2orig if reverse else _orig2prim lookup_fn = lookup_prim2orig if reverse else lookup_orig2prim - if block is None: - program = default_main_program() - assert program.num_blocks == 1, "The lower transform is designed to process only one block." - block = program.current_block() value_table = {} to_bind = {} @@ -516,6 +515,7 @@ def orig2prim(block=None): """ .. note:: **This API is ONLY available in the static mode.** + **Args block must be None or current block of main program.** All operators in the target block are processed as follows. If it is an original operator, it will be transformed into @@ -523,13 +523,14 @@ def orig2prim(block=None): equivalent function. Args: - block(paddle.fluid.framework.Variable|None, optional): The + block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. - - Returns: - None """ + + block = default_main_program().current_block() if block is None else block + assert block == default_main_program().current_block( + ), f'block is neither None nor current block of main program' _lower(block, reverse=False) @@ -538,6 +539,7 @@ def prim2orig(block=None): """ .. note:: **ONLY available in the static mode.** + **Args block must be None or current block of main program.** All operators in the target block are processed as follows. If it is an automatic differential basic operator, it will be @@ -545,10 +547,10 @@ def prim2orig(block=None): equivalent function to support execution. 
Args: - block(paddle.static.Variable|None, optional): The + block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. - + Examples: .. code-block:: python @@ -566,6 +568,10 @@ def prim2orig(block=None): if prim_enabled(): prim2orig() """ + + block = default_main_program().current_block() if block is None else block + assert block == default_main_program().current_block( + ), f'block is neither None nor current block of main program' _lower(block, reverse=True) @@ -583,7 +589,9 @@ def _gradients(ys, xs, ys_bar=None): """ ys, xs = to_tensors(ys), to_tensors(xs) - block = ys[0].block + block = default_main_program().current_block() + for el in xs + ys: + assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program' # TODO(Tongxin) without any prior knowledge about whether the program # is completely lowered to primitive ops, it's mandatory to run the lowering # pass once and again. This is obviously inefficient and needs to be diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py index 43fcabf97317e..cf15ee7d8ffaa 100644 --- a/python/paddle/incubate/nn/__init__.py +++ b/python/paddle/incubate/nn/__init__.py @@ -16,10 +16,14 @@ from .layer.fused_transformer import FusedFeedForward # noqa: F401 from .layer.fused_transformer import FusedTransformerEncoderLayer # noqa: F401 from .layer.fused_transformer import FusedMultiTransformer # noqa: F401 +from .layer.fused_linear import FusedLinear # noqa: F401 +from .layer.fused_transformer import FusedBiasDropoutResidualLayerNorm # noqa: F401 __all__ = [ #noqa 'FusedMultiHeadAttention', 'FusedFeedForward', 'FusedTransformerEncoderLayer', 'FusedMultiTransformer', + 'FusedLinear', + 'FusedBiasDropoutResidualLayerNorm', ] diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py index 4da090487785b..e9894990455ab 100644 --- a/python/paddle/incubate/nn/functional/__init__.py +++ b/python/paddle/incubate/nn/functional/__init__.py @@ -15,9 +15,14 @@ from .fused_transformer import fused_multi_head_attention from .fused_transformer import fused_feedforward from .fused_transformer import fused_multi_transformer +from .fused_matmul_bias import fused_matmul_bias, fused_linear +from .fused_transformer import fused_bias_dropout_residual_layer_norm __all__ = [ 'fused_multi_head_attention', 'fused_feedforward', 'fused_multi_transformer', + 'fused_matmul_bias', + 'fused_linear', + 'fused_bias_dropout_residual_layer_norm', ] diff --git a/python/paddle/incubate/nn/functional/fused_matmul_bias.py b/python/paddle/incubate/nn/functional/fused_matmul_bias.py new file mode 100644 index 0000000000000..bcc2e62144589 --- /dev/null +++ b/python/paddle/incubate/nn/functional/fused_matmul_bias.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
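The re-exports in the two __init__.py hunks above make these kernels reachable as paddle.incubate.nn.FusedLinear and paddle.incubate.nn.functional.fused_linear / fused_matmul_bias. A minimal smoke test of those import paths (a sketch only, assuming a GPU build with CUDA >= 11.6, which the fused_gemm_epilogue op defined below requires):

.. code-block:: python

    # required: gpu
    import paddle
    from paddle.incubate.nn import FusedLinear
    from paddle.incubate.nn.functional import fused_linear

    x = paddle.randn([3, 4])
    layer = FusedLinear(4, 5)
    y_layer = layer(x)                                   # layer form
    y_func = fused_linear(x, layer.weight, layer.bias)   # functional form
    print(y_layer.shape, y_func.shape)                   # [3, 5] [3, 5]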
+ +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _non_static_mode +from paddle.tensor.linalg import matmul +from paddle import _C_ops + + +def fused_matmul_bias(x, + y, + bias=None, + transpose_x=False, + transpose_y=False, + name=None): + """ + Applies matrix multiplication of two tensors and then bias addition if provided. + This method requires CUDA version >= 11.6. + + Args: + x (Tensor): the first input Tensor to be multiplied. + y (Tensor): the second input Tensor to be multiplied. Its rank must be 2. + bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would + be performed. Otherwise, the bias is added to the matrix multiplication result. + transpose_x (bool): Whether to transpose :math:`x` before multiplication. + transpose_y (bool): Whether to transpose :math:`y` before multiplication. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + + Returns: + Tensor: the output Tensor. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + from paddle.incubate.nn.functional import fused_matmul_bias + + x = paddle.randn([3, 4]) + y = paddle.randn([4, 5]) + bias = paddle.randn([5]) + out = fused_matmul_bias(x, y, bias) + print(out.shape) # [3, 5] + """ + if bias is None: + return matmul(x, y, transpose_x, transpose_y, name) + if _non_static_mode(): + return _C_ops.fused_gemm_epilogue(x, y, bias, 'trans_x', transpose_x, + 'trans_y', transpose_y) + + helper = LayerHelper('fused_matmul_bias', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='fused_gemm_epilogue', + inputs={'X': x, + 'Y': y, + 'Bias': bias}, + outputs={'Out': out}, + attrs={'trans_x': transpose_x, + 'trans_y': transpose_y}) + return out + + +def fused_linear(x, weight, bias=None, transpose_weight=False, name=None): + """ + Fully-connected linear transformation operator. This method requires CUDA version >= 11.6. + + Args: + x (Tensor): the input Tensor to be multiplied. + weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2. + bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would + be performed. Otherwise, the bias is added to the matrix multiplication result. + transpose_weight (bool): Whether to transpose :math:`weight` before multiplication. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + + Returns: + Tensor: the output Tensor. + + Examples: + .. 
code-block:: python + + # required: gpu + import paddle + from paddle.incubate.nn.functional import fused_linear + + x = paddle.randn([3, 4]) + weight = paddle.randn([4, 5]) + bias = paddle.randn([5]) + out = fused_linear(x, weight, bias) + print(out.shape) # [3, 5] + """ + return fused_matmul_bias(x, weight, bias, False, transpose_weight, name) diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 3e263f1c6d3ae..232e16415a5f7 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -212,6 +212,151 @@ def fused_feedforward(x, return out +def fused_bias_dropout_residual_layer_norm(x, + residual, + bias=None, + ln_scale=None, + ln_bias=None, + dropout_rate=0.5, + ln_epsilon=1e-5, + training=True, + mode='upscale_in_train', + name=None): + r""" + The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: + + .. code-block:: python + y = layer_norm(residual + dropout(bias + x)) + + Parameters: + x (Tensor): The input tensor. The shape is `[*, embed\_dim]`. + residual (Tensor): The residual tensor. The shape is same as x. + bias (Tensor, optional): The bias of linear. The shape is `[embed_dim]`. Default None. + ln_scale (Tensor, optional): The weight tensor of layernorm. The shape is `[embed_dim]`. Default None. + ln_bias (Tensor, optional): The bias tensor of layernorm. The shape is `[embed_dim]`. Default None. + dropout_rate (float, optional): The dropout probability used on attention + weights to drop some attention targets for the dropout after attention. + 0 for no dropout. Default 0.5. + ln_epsilon (float, optional): Small float value added to denominator of layer_norm + to avoid dividing by zero. Default is 1e-5. + training (bool, optional): A flag indicating whether it is in train phrase or not. Default True. + mode (str, optional): ['upscale_in_train'(default) | 'downscale_in_infer'] + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - p ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - p) + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The output Tensor, the data type and shape is same as `x`. + + Examples: + + .. code-block:: python + + # required: gpu + import paddle + import paddle.incubate.nn.functional as F + + # input: [batch_size, seq_len, embed_dim] + x = paddle.rand(shape=(2, 4, 128), dtype="float32") + # residual: [batch_size, seq_len, embed_dim] + residual = paddle.rand(shape=(2, 4, 128), dtype="float32") + # linear bias: [embed_dim] + bias = paddle.rand(shape=[128], dtype="float32") + # output: [batch_size, seq_len, embed_dim] + output = F.fused_bias_dropout_residual_layer_norm( + x, residual, bias) + # [2, 4, 128] + print(output.shape) + """ + seed = None + if mode not in ('downscale_in_infer', 'upscale_in_train'): + raise ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + if ln_scale is not None: + assert len(ln_scale. + shape) == 1, "The dims of the shape of ln_scale should be 1." 
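    # Illustrative sketch of the two dropout conventions documented above, using
    # hypothetical scalars (p = dropout_rate, m = the sampled keep mask; these are
    # not inputs of the op):
    #     p, x_val, m = 0.5, 2.0, 1.0
    #     upscale_in_train:   train -> x_val * m / (1.0 - p) = 4.0,  infer -> x_val = 2.0
    #     downscale_in_infer: train -> x_val * m = 2.0,              infer -> x_val * (1.0 - p) = 1.0
    # Both conventions preserve the expected activation magnitude; they differ only
    # in which phase applies the 1 / (1 - p) rescaling. The line above maps
    # 'downscale_in_infer' to the op's legacy 'downgrade_in_infer' spelling before dispatch.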
+ assert x.shape[len(x.shape) - 1] == ln_scale.shape[ + 0], "The dim of ln_scale must equal to the last dim of x." + if ln_bias is not None: + assert len( + ln_bias.shape) == 1, "The dims of the shape of ln_bias should be 1." + assert x.shape[len(x.shape) - 1] == ln_bias.shape[ + 0], "The dim of ln_bias must equal to the last dim of x." + + if _non_static_mode(): + if default_main_program().random_seed != 0: + seed = default_main_program().random_seed + _, _, _, _, final_out = _C_ops.fused_bias_dropout_residual_layer_norm( + x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate, + 'ln_epsilon', ln_epsilon, 'is_test', not training, + 'dropout_fix_seed', seed is not None, 'dropout_seed', seed + if seed is not None else 0, 'dropout_implementation', mode) + return final_out + else: + helper = LayerHelper('fused_bias_dropout_residual_layer_norm', + **locals()) + dtype = x.dtype + # check dtypes + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm') + # set inputs + inputs = dict() + inputs['X'] = [x] + inputs['Residual'] = [residual] + if bias is not None: + inputs['Bias'] = [bias] + if ln_scale: + inputs['LnScale'] = [ln_scale] + if ln_bias: + inputs['LnBias'] = [ln_bias] + if (seed is None or seed == 0) and helper.main_program.random_seed != 0: + seed = helper.main_program.random_seed + # set attrs + attrs = { + 'ln_epsilon': ln_epsilon, + 'dropout_rate': dropout_rate, + 'is_test': not training, + 'dropout_fix_seed': seed is not None, + 'dropout_seed': seed if seed is not None else 0, + 'dropout_implementation': mode, + } + # set outputs + dropout_mask_out = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + ln_mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + ln_variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + bias_dropout_residual_out = helper.create_variable_for_type_inference( + dtype=dtype) + final_out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='fused_bias_dropout_residual_layer_norm', + inputs=inputs, + outputs={ + "BiasDropoutResidualOut": bias_dropout_residual_out, + "DropoutMaskOut": dropout_mask_out, + "LnMean": ln_mean_out, + "LnVariance": ln_variance_out, + 'Y': final_out, + }, + attrs=attrs) + return final_out + + def fused_multi_head_attention(x, qkv_weight, linear_weight, @@ -368,10 +513,9 @@ def fused_multi_head_attention(x, attn_mask, linear_weight, linear_bias, ln_scale, ln_bias, 'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon, 'dropout_rate', dropout_rate, 'attn_dropout_rate', - attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'attn_dropout_is_test', - not training, 'dropout_is_test', not training, - 'attn_dropout_fix_seed', seed is not None, 'dropout_fix_seed', - seed is not None, 'attn_dropout_seed', seed + attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test', + not training, 'attn_dropout_fix_seed', seed is not None, + 'dropout_fix_seed', seed is not None, 'attn_dropout_seed', seed if seed is not None else 0, 'dropout_seed', seed if seed is not None else 0, 'attn_dropout_implementation', mode, 'dropout_implementation', mode, 'ring_id', ring_id) @@ -417,8 +561,7 @@ def fused_multi_head_attention(x, 'ln_epsilon': ln_epsilon, 'dropout_rate': dropout_rate, 'attn_dropout_rate': attn_dropout_rate, - 
'attn_dropout_is_test': not training, - 'dropout_is_test': not training, + 'is_test': not training, 'attn_dropout_fix_seed': seed is not None, 'dropout_fix_seed': seed is not None, 'attn_dropout_seed': seed if seed is not None else 0, @@ -656,7 +799,7 @@ def fused_multi_transformer(x, time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales, ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, - 'dropout_rate', dropout_rate, 'dropout_is_test', not training, + 'dropout_rate', dropout_rate, 'is_test', not training, 'dropout_implementation', mode, 'act_method', activation, 'ring_id', ring_id) if cache_kvs is not None: @@ -703,7 +846,7 @@ def fused_multi_transformer(x, 'pre_layer_norm': pre_layer_norm, 'epsilon': epsilon, 'dropout_rate': dropout_rate, - 'dropout_is_test': not training, + 'is_test': not training, 'dropout_implementation': mode, 'act_method': activation, 'ring_id': ring_id diff --git a/python/paddle/incubate/nn/layer/fused_linear.py b/python/paddle/incubate/nn/layer/fused_linear.py new file mode 100644 index 0000000000000..f7c872c3993cf --- /dev/null +++ b/python/paddle/incubate/nn/layer/fused_linear.py @@ -0,0 +1,91 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Layer +from paddle.incubate.nn import functional as F + + +class FusedLinear(Layer): + """ + Linear layer takes only one multi-dimensional tensor as input with the + shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any + number of additional dimensions. It multiplies input tensor with the weight + (a 2-D tensor of shape :math:`[in\_features, out\_features]` ) and produces + an output tensor of shape :math:`[batch\_size, *, out\_features]` . + If :math:`bias\_attr` is not False, the bias (a 1-D tensor of + shape :math:`[out\_features]` ) will be created and added to the output. + + Parameters: + in_features (int): The number of input units. + out_features (int): The number of output units. + weight_attr (ParamAttr, optional): The attribute for the learnable + weight of this layer. The default value is None and the weight will be + initialized to zero. For detailed information, please refer to + paddle.ParamAttr. + transpose_weight (bool): Whether to transpose the `weight` Tensor before + multiplication. + bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias + of this layer. If it is set to False, no bias will be added to the output. + If it is set to None or one kind of ParamAttr, a bias parameter will + be created according to ParamAttr. For detailed information, please refer + to paddle.ParamAttr. The default value is None and the bias will be + initialized to zero. + name (str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . + + Attribute: + **weight** (Parameter): the learnable weight of this layer. 
+ + **bias** (Parameter): the learnable bias of this layer. + + Shape: + - input: Multi-dimentional tensor with shape :math:`[batch\_size, *, in\_features]` . + - output: Multi-dimentional tensor with shape :math:`[batch\_size, *, out\_features]` . + + Examples: + .. code-block:: python + + # required: gpu + import paddle + from paddle.incubate.nn import FusedLinear + + x = paddle.randn([3, 4]) + linear = FusedLinear(4, 5) + y = linear(x) + print(y.shape) # [3, 5] + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + transpose_weight=False, + name=None): + super(FusedLinear, self).__init__() + if transpose_weight: + weight_shape = [out_features, in_features] + else: + weight_shape = [in_features, out_features] + dtype = self._helper.get_default_dtype() + self.weight = self.create_parameter( + shape=weight_shape, attr=weight_attr, dtype=dtype, is_bias=False) + self.bias = self.create_parameter( + shape=[out_features], attr=bias_attr, dtype=dtype, is_bias=True) + self.transpose_weight = transpose_weight + self.name = name + + def forward(self, input): + return F.fused_linear(input, self.weight, self.bias, + self.transpose_weight, self.name) diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 072c7d9fccade..a64b7e506021c 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -36,6 +36,103 @@ def _set_var_distributed(var): main_block._find_var_recursive(var.name).is_distributed = True +class FusedBiasDropoutResidualLayerNorm(Layer): + """ + Applies fused_bias_dropout_residual_layer_norm operation. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + dropout_rate (float, optional): The dropout probability used on attention + weights to drop some attention targets for the dropout after attention. + 0 for no dropout. Default 0.5. + bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + If it is set to False, this layer will not have trainable bias parameter. + See usage for details in :code:`ParamAttr`. + epsilon (float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + + Examples: + + .. 
code-block:: python + + # required: gpu + import paddle + # input: [batch_size, seq_len, embed_dim] + x = paddle.rand((2, 4, 128)) + # residual: [batch_size, seq_len, embed_dim] + residual = paddle.rand((2, 4, 128)) + fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128) + output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + dropout_rate=0.5, + weight_attr=None, + bias_attr=None, + epsilon=1e-5, + name=None): + super(FusedBiasDropoutResidualLayerNorm, self).__init__() + assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " + "but recieved {}".format(embed_dim)) + self._dtype = self._helper.get_default_dtype() + self._bias_attr = bias_attr + self._weight_attr = weight_attr + self.embed_dim = embed_dim + self.linear_bias = self.create_parameter( + shape=[embed_dim], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.ln_scale = self.create_parameter( + attr=self._weight_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.ln_bias = self.create_parameter( + attr=self._bias_attr, shape=[embed_dim], is_bias=True) + self.dropout_rate = dropout_rate + self._epsilon = epsilon + + self.name = name + + def forward(self, x, residual): + """ + Applies fused_bias_dropout_residual_layer_norm operation. + + Parameters: + x (Tensor): The input tensor. It is a tensor with shape + `[batch_size, seq_len, embed_dim]`. The data type should be + float32 or float64. + residual (Tensor, optional): The residual tensor. It is a tensor + with shape `[batch_size, value_length, vdim]`. The data type + should be float32 or float64. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `x`. + """ + + out = incubate_f.fused_bias_dropout_residual_layer_norm( + x=x, + residual=residual, + bias=self.linear_bias, + ln_scale=self.ln_scale, + ln_bias=self.ln_bias, + dropout_rate=self.dropout_rate, + ln_epsilon=self._epsilon, + training=self.training, + mode='upscale_in_train', + name=self.name) + return out + + def extra_repr(self): + name_str = ', name={}'.format(self.name) if self.name else '' + return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format( + self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon, + self._dtype, name_str) + + class FusedMultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/incubate/sparse/__init__.py similarity index 69% rename from python/paddle/sparse/layer/__init__.py rename to python/paddle/incubate/sparse/__init__.py index 8a814b514276f..c499c017a48e8 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -12,10 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .unary import ReLU -from .norm import BatchNorm -from .conv import Conv3D -from .conv import SubmConv3D -from .pooling import MaxPool3D +from .creation import sparse_coo_tensor +from .creation import sparse_csr_tensor -__all__ = [] +from .unary import sqrt +from .unary import sin +from .unary import tanh + +from . 
import nn + +__all__ = [ + 'sparse_coo_tensor', + 'sparse_csr_tensor', + 'sqrt', + 'sin', + 'tanh', +] diff --git a/python/paddle/sparse/creation.py b/python/paddle/incubate/sparse/creation.py similarity index 96% rename from python/paddle/sparse/creation.py rename to python/paddle/incubate/sparse/creation.py index 2cfbb3144acc2..7c30910071ce2 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/incubate/sparse/creation.py @@ -14,11 +14,10 @@ import paddle from paddle import _C_ops -from ..framework import core, dygraph_only -from ..framework import _current_expected_place, _get_paddle_place -from ..tensor import to_tensor -from ..tensor import max -from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from paddle.fluid.framework import core, dygraph_only +from paddle.fluid.framework import _current_expected_place, _get_paddle_place +from paddle.tensor import to_tensor, max +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype import numpy as np @@ -112,7 +111,7 @@ def sparse_coo_tensor(indices, indices = [[0, 1, 2], [1, 2, 0]] values = [1.0, 2.0, 3.0] dense_shape = [3, 3] - coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape) # print(coo) # Tensor(shape=[2, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, # indices=[[0, 1, 2], @@ -222,7 +221,7 @@ def sparse_csr_tensor(crows, cols = [1, 3, 2, 0, 1] values = [1, 2, 3, 4, 5] dense_shape = [3, 4] - csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) # print(csr) # Tensor(shape=[3, 4], dtype=paddle.int64, place=Place(gpu:0), stop_gradient=True, # crows=[0, 2, 3, 5], diff --git a/python/paddle/sparse/__init__.py b/python/paddle/incubate/sparse/nn/__init__.py similarity index 59% rename from python/paddle/sparse/__init__.py rename to python/paddle/incubate/sparse/nn/__init__.py index 26a2f0cfadbe7..be4985e694b4b 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/incubate/sparse/nn/__init__.py @@ -12,21 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .creation import sparse_coo_tensor -from .creation import sparse_csr_tensor -from .layer import ReLU -from .layer import BatchNorm +from . 
import functional -from .layer import Conv3D -from .layer import SubmConv3D - -from .layer import MaxPool3D - -from .functional import sqrt -from .functional import sin -from .functional import tanh +from .layer.activation import ReLU +from .layer.norm import BatchNorm +from .layer.conv import Conv3D +from .layer.conv import SubmConv3D +from .layer.pooling import MaxPool3D __all__ = [ - 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D', - 'BatchNorm', 'MaxPool3D', 'sqrt', 'sin', 'tanh' + 'ReLU', + 'BatchNorm', + 'Conv3D', + 'SubmConv3D', + 'MaxPool3D', ] diff --git a/python/paddle/sparse/functional/__init__.py b/python/paddle/incubate/sparse/nn/functional/__init__.py similarity index 76% rename from python/paddle/sparse/functional/__init__.py rename to python/paddle/incubate/sparse/nn/functional/__init__.py index cfefa3ff4ff76..a16a8a8240a23 100644 --- a/python/paddle/sparse/functional/__init__.py +++ b/python/paddle/incubate/sparse/nn/functional/__init__.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .unary import relu # noqa: F401 -from .unary import tanh # noqa: F401 -from .unary import sqrt # noqa: F401 -from .unary import sin # noqa: F401 from .conv import conv3d # noqa: F401 from .conv import subm_conv3d # noqa: F401 from .pooling import max_pool3d # noqa: F401 +from .activation import relu # noqa: F401 -__all__ = ['relu', 'tanh', 'conv3d', 'subm_conv3d', 'max_pool3d', 'sqrt', 'sin'] +__all__ = [ + 'conv3d', + 'subm_conv3d', + 'max_pool3d', + 'relu', +] diff --git a/python/paddle/incubate/sparse/nn/functional/activation.py b/python/paddle/incubate/sparse/nn/functional/activation.py new file mode 100644 index 0000000000000..3396cc53cc479 --- /dev/null +++ b/python/paddle/incubate/sparse/nn/functional/activation.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] + +from paddle import _C_ops, in_dynamic_mode + + +def relu(x, name=None): + """ + sparse relu activation, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = max(x, 0) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.nn.functional.relu(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo() or x.is_sparse_csr(): + return _C_ops.final_state_sparse_relu(x) + else: + raise ValueError( + "Currently, sparse.relu only support the input of SparseCooTensor or SparseCsrTensor" + ) diff --git a/python/paddle/sparse/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py similarity index 96% rename from python/paddle/sparse/functional/conv.py rename to python/paddle/incubate/sparse/nn/functional/conv.py index 42b7b49835cf0..d67d67e8d74cf 100644 --- a/python/paddle/sparse/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -15,9 +15,9 @@ __all__ = [] from paddle import _C_ops, in_dynamic_mode -from ...fluid.layers.utils import convert_to_list -from ...fluid.layers.nn import elementwise_add -from .. import sparse_coo_tensor +from paddle.fluid.layers.utils import convert_to_list +from paddle.fluid.layers.nn import elementwise_add +from ...creation import sparse_coo_tensor from paddle.nn.functional.conv import _update_padding_nd @@ -180,9 +180,9 @@ def conv3d(x, indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') - y = paddle.sparse.functional.conv3d(sparse_x, weight) + y = paddle.incubate.sparse.nn.functional.conv3d(sparse_x, weight) print(y.shape) # (1, 1, 1, 2, 1) """ @@ -295,9 +295,9 @@ def subm_conv3d(x, indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') - y = paddle.sparse.functional.subm_conv3d(sparse_x, weight) + y = paddle.incubate.sparse.nn.functional.subm_conv3d(sparse_x, weight) print(y.shape) #(1, 1, 3, 4, 1) """ diff --git a/python/paddle/sparse/functional/pooling.py b/python/paddle/incubate/sparse/nn/functional/pooling.py similarity index 96% rename from python/paddle/sparse/functional/pooling.py rename to python/paddle/incubate/sparse/nn/functional/pooling.py index ab5106b31689d..0c0b0cbadad25 100644 --- a/python/paddle/sparse/functional/pooling.py +++ b/python/paddle/incubate/sparse/nn/functional/pooling.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
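# A quick map of the sparse namespace moves carried by the renames above and
# below (the public symbols themselves are unchanged by the move):
#   paddle.sparse.sparse_coo_tensor / sparse_csr_tensor -> paddle.incubate.sparse.sparse_coo_tensor / sparse_csr_tensor
#   paddle.sparse.sqrt / sin / tanh                      -> paddle.incubate.sparse.sqrt / sin / tanh
#   paddle.sparse.functional.relu                        -> paddle.incubate.sparse.nn.functional.relu
#   paddle.sparse.functional.conv3d / subm_conv3d / max_pool3d
#                                                        -> paddle.incubate.sparse.nn.functional.<same name>
#   paddle.sparse.ReLU / BatchNorm / Conv3D / SubmConv3D / MaxPool3D
#                                                        -> paddle.incubate.sparse.nn.<same name>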
-from ...fluid.layers import utils +from paddle.fluid.layers import utils from paddle import _C_ops, in_dynamic_mode from paddle.nn.functional.pooling import _update_padding_nd @@ -70,7 +70,7 @@ def max_pool3d(x, kernel_sizes = [3, 3, 3] paddings = [0, 0, 0] strides = [1, 1, 1] - out = paddle.sparse.functional.max_pool3d(sparse_x, kernel_sizes, stride=strides, padding=paddings) + out = paddle.incubate.sparse.nn.functional.max_pool3d(sparse_x, kernel_sizes, stride=strides, padding=paddings) #[1, 2, 2, 2, 3] """ diff --git a/python/paddle/sparse/layer/unary.py b/python/paddle/incubate/sparse/nn/layer/activation.py similarity index 97% rename from python/paddle/sparse/layer/unary.py rename to python/paddle/incubate/sparse/nn/layer/activation.py index ad0dbc1880782..75285eb11adc2 100644 --- a/python/paddle/sparse/layer/unary.py +++ b/python/paddle/incubate/sparse/nn/layer/activation.py @@ -44,7 +44,7 @@ class ReLU(Layer): dense_x = paddle.to_tensor(x, dtype='float32') sparse_dim = 2 sparse_x = dense_x.to_sparse_coo(sparse_dim) - relu = paddle.sparse.ReLU() + relu = paddle.incubate.sparse.nn.ReLU() out = relu(sparse_x) #out.values: [0., 2., 0., 4., 5.] """ diff --git a/python/paddle/sparse/layer/conv.py b/python/paddle/incubate/sparse/nn/layer/conv.py similarity index 97% rename from python/paddle/sparse/layer/conv.py rename to python/paddle/incubate/sparse/nn/layer/conv.py index ff421a06a1344..e00ca78f784f3 100644 --- a/python/paddle/sparse/layer/conv.py +++ b/python/paddle/incubate/sparse/nn/layer/conv.py @@ -16,8 +16,8 @@ from .. import functional as F from paddle.nn import Layer from paddle.nn.initializer import Normal -from ..functional.conv import _update_padding_nd -from ...fluid.layers import utils +from paddle.nn.functional.conv import _update_padding_nd +from paddle.fluid.layers import utils __all__ = [] @@ -213,8 +213,8 @@ class Conv3D(_Conv3D): indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) - conv = paddle.sparse.Conv3D(1, 1, (1, 3, 3)) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + conv = paddle.incubate.sparse.nn.Conv3D(1, 1, (1, 3, 3)) y = conv(sparse_x) print(y.shape) # (1, 1, 1, 2, 1) @@ -346,8 +346,8 @@ class SubmConv3D(_Conv3D): dense_shape = [1, 1, 3, 4, 1] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) - subm_conv = paddle.sparse.SubmConv3D(1, 1, (1, 3, 3)) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + subm_conv = paddle.incubate.sparse.nn.SubmConv3D(1, 1, (1, 3, 3)) y = subm_conv(sparse_x) print(y.shape) # (1, 1, 3, 4, 1) diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py similarity index 98% rename from python/paddle/sparse/layer/norm.py rename to python/paddle/incubate/sparse/nn/layer/norm.py index 83b738a5dc354..1a9b1f15e4cf1 100644 --- a/python/paddle/sparse/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -100,7 +100,7 @@ class BatchNorm(paddle.nn.BatchNorm1D): x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') dense_x = paddle.to_tensor(x_data) sparse_x = dense_x.to_sparse_coo(4) - batch_norm = paddle.sparse.BatchNorm(channels) + batch_norm = 
paddle.incubate.sparse.nn.BatchNorm(channels) batch_norm_out = batch_norm(sparse_x) print(batch_norm_out.shape) # [1, 6, 6, 6, 3] @@ -153,7 +153,7 @@ def forward(self, input): data_format='NC', use_global_stats=self._use_global_stats) - return paddle.sparse.sparse_coo_tensor( + return paddle.incubate.sparse.sparse_coo_tensor( input.indices(), batch_norm_out, shape=input.shape, diff --git a/python/paddle/sparse/layer/pooling.py b/python/paddle/incubate/sparse/nn/layer/pooling.py similarity index 98% rename from python/paddle/sparse/layer/pooling.py rename to python/paddle/incubate/sparse/nn/layer/pooling.py index 9cfe463eed577..98be6e125f44d 100644 --- a/python/paddle/sparse/layer/pooling.py +++ b/python/paddle/incubate/sparse/nn/layer/pooling.py @@ -66,7 +66,7 @@ class MaxPool3D(Layer): with _test_eager_guard(): dense_x = paddle.randn((2, 3, 6, 6, 3)) sparse_x = dense_x.to_sparse_coo(4) - max_pool3d = paddle.sparse.MaxPool3D( + max_pool3d = paddle.incubate.sparse.nn.MaxPool3D( kernel_size=3, data_format='NDHWC') out = max_pool3d(sparse_x) #shape=[2, 1, 2, 2, 3] diff --git a/python/paddle/sparse/functional/unary.py b/python/paddle/incubate/sparse/unary.py similarity index 75% rename from python/paddle/sparse/functional/unary.py rename to python/paddle/incubate/sparse/unary.py index 550e6a2a39261..85e4088de7d78 100644 --- a/python/paddle/sparse/functional/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -17,44 +17,6 @@ from paddle import _C_ops, in_dynamic_mode -def relu(x, name=None): - """ - sparse relu activation, requiring x to be a sparse coo or sparse csr tensor. - - .. math:: - - out = max(x, 0) - - Parameters: - x (Tensor): The input Sparse Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - - Returns: - A Sparse Tensor with the same data type and shape as ``x`` . - - Examples: - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.sparse.functional.relu(sparse_x) - """ - - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_relu(x) - else: - raise ValueError( - "Currently, sparse.relu only support the input of SparseCooTensor or SparseCsrTensor" - ) - - def tanh(x, name=None): """ sparse tanh activation, requiring x to be a sparse coo or sparse csr tensor. 
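The hunk above drops relu from the unary module (it now lives in paddle.incubate.sparse.nn.functional), while sqrt, sin and tanh stay at the package root; a short dynamic-mode sketch of the resulting call sites (assuming a build where the sparse kernels are available):

.. code-block:: python

    import paddle
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():
        dense = paddle.to_tensor([4.0, 0.0, 9.0])
        sp = dense.to_sparse_coo(1)
        out_sqrt = paddle.incubate.sparse.sqrt(sp)                # unary math, package root
        out_relu = paddle.incubate.sparse.nn.functional.relu(sp)  # activation, moved under nn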
@@ -80,7 +42,7 @@ def tanh(x, name=None): with _test_eager_guard(): dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) - out = paddle.sparse.tanh(sparse_x) + out = paddle.incubate.sparse.tanh(sparse_x) """ assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" @@ -118,7 +80,7 @@ def sqrt(x, name=None): with _test_eager_guard(): dense_x = paddle.to_tensor([4, 0, 1], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) - out = paddle.sparse.sqrt(sparse_x) + out = paddle.incubate.sparse.sqrt(sparse_x) """ assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" @@ -156,7 +118,7 @@ def sin(x, name=None): with _test_eager_guard(): dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) - out = paddle.sparse.sin(sparse_x) + out = paddle.incubate.sparse.sin(sparse_x) """ assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index dded115e2e740..9787a23c689c7 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -51,6 +51,7 @@ from .layer.activation import ThresholdedReLU # noqa: F401 from .layer.activation import LogSoftmax # noqa: F401 from .layer.activation import Maxout # noqa: F401 +from .layer.activation import RReLU # noqa: F401 from .layer.common import Pad1D # noqa: F401 from .layer.common import Pad2D # noqa: F401 from .layer.common import ZeroPad2D # noqa: F401 @@ -314,5 +315,6 @@ def weight_norm(*args): 'MaxUnPool3D', 'HingeEmbeddingLoss', 'Identity', + 'RReLU', 'MultiLabelSoftMarginLoss', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index bd6f8503d5e79..b478a73eeacae 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -47,6 +47,7 @@ from .activation import log_softmax # noqa: F401 from .activation import glu # noqa: F401 from .activation import gumbel_softmax # noqa: F401 +from .activation import rrelu # noqa: F401 from .common import dropout # noqa: F401 from .common import dropout2d # noqa: F401 from .common import dropout3d # noqa: F401 @@ -119,8 +120,8 @@ from .vision import channel_shuffle # noqa: F401 from .input import one_hot # noqa: F401 from .input import embedding # noqa: F401 -from ...fluid.layers import gather_tree # noqa: F401 -from ...fluid.layers import temporal_shift # noqa: F401 +from .extension import gather_tree # noqa: F401 +from .extension import temporal_shift # noqa: F401 from .sparse_attention import sparse_attention @@ -229,5 +230,6 @@ 'class_center_sample', 'sparse_attention', 'fold', + 'rrelu', 'multi_label_soft_margin_loss', ] diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 6970cf4962909..dd314868b69e2 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.layers import sigmoid # noqa: F401 +from ...tensor.ops import sigmoid # noqa: F401 from ...tensor.math import tanh # noqa: F401 from ...tensor.math import tanh_ # noqa: F401 @@ -550,6 +550,122 @@ def prelu(x, weight, data_format="NCHW", name=None): return out +def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): + r""" + rrelu activation. 
+ + Applies the randomized leaky rectified liner unit function to improve generalization performance, + as described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_ + + During training, randomly samples the negative slope for activation values as described below: + + .. math:: + + rrelu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + a * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`a` is randomly sampled from uniform distribution in range (:math:`lower`, :math:`upper`), + + In the test phase, the negative slope will take the average value of :math:`lower` and :math:`upper`: + + .. math:: + + rrelu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + (lower + upper) * 0.5 * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`lower` and :math:`upper` are the bounds of uniform distribution. + + Parameters: + x (Tensor): The input Tensor with data type float16, float32, float64. + lower (float, optional): The lower bound of uniform distribution. Default: 0.125. + upper (float, optional): The upper bound of uniform distribution. Default: 0.333. + training (bool, optional): Current mode is in training or others. Default is True. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + :name: rrelu-example + + import paddle + import paddle.nn.functional as F + + input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + + out = F.rrelu(input_tensor, 0.1, 0.3) + #[[[[-0.20000899 3. -0.8810822 5. ] + # [ 3. -0.55175185 5. -1.0776101 ] + # [-1.0680687 -1.9896201 8. 9. ]] + # [[ 1. -0.5238267 -0.65515125 4. ] + # [-1.3766339 6. 7. -2.3465784 ] + # [ 6. 7. 8. 9. ]]]] + """ + + if not in_dynamic_mode(): + check_variable_and_dtype(x, 'X', ['float16', 'float32', 'float64'], + 'rrelu') + + if not isinstance(lower, float) or not isinstance(upper, float): + raise TypeError( + "The lower and upper values must be float type. Received: lower {}, upper {}.". + format(lower, upper)) + + if lower < 0 or lower > 1: + raise ValueError( + "The lower value must be no less than zero or greater than one. Received: {}.". + format(lower)) + + if upper < lower: + raise ValueError( + "The upper value must be greater than lower value. Received: lower {}, upper {}.". + format(lower, upper)) + + if upper > 1: + raise ValueError( + "The upper value must be no greater than one. Received: {}.".format( + upper)) + + is_test = not training + + if _in_legacy_dygraph(): + out, noise = _C_ops.rrelu(x, 'lower', lower, 'upper', upper, 'is_test', + is_test) + return out + + helper = LayerHelper('rrelu', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + noise = helper.create_variable_for_type_inference(dtype=x.dtype) + attrs = {'lower': lower, 'upper': upper, 'is_test': is_test} + helper.append_op( + type='rrelu', + inputs={"X": x}, + outputs={"Out": out, + "Noise": noise}, + attrs=attrs) + return out + + def relu(x, name=None): """ relu activation. 
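Per the eval-phase formula above, the sampled slope collapses to (lower + upper) / 2 when training=False; a rough consistency check one could run, assuming paddle.nn.functional.leaky_relu (not part of this patch) as the fixed-slope reference:

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([[-2.0, 0.0, 3.0]])
    lower, upper = 0.1, 0.3
    out = F.rrelu(x, lower, upper, training=False)               # deterministic in eval mode
    ref = F.leaky_relu(x, negative_slope=(lower + upper) / 2.0)
    print(paddle.allclose(out, ref))                             # expected: True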
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index fe37b8fb97c3d..7fed1dbb487fa 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -21,7 +21,6 @@ from paddle.static import Variable from ...fluid import dygraph_utils # TODO: define the common functions to build a neural network -from ...fluid.layers import unfold # noqa: F401 from ...tensor.manipulation import squeeze from ...tensor.manipulation import unsqueeze from ...tensor import clip @@ -31,8 +30,6 @@ from ...fluid.framework import _varbase_creator, _in_legacy_dygraph, in_dygraph_mode, _non_static_mode from ...fluid import dygraph_utils -from ...fluid import layers -from ...fluid.data_feeder import check_variable_and_dtype from paddle import _C_ops from paddle.framework import in_dynamic_mode @@ -44,6 +41,135 @@ __all__ = [] +def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): + r""" + + This op returns a col buffer of sliding local blocks of input x, also known + as im2col for batched 2D image tensors. For each block under the convolution filter, + all element will be rearranged as a column. While the convolution filter sliding over + the input feature map, a series of such columns will be formed. + + For each input :math:`x` with shape [N, C, H, W], the output shape [N, Cout, Lout] + can be calculated as following. + + .. math:: + + dkernel[0] &= dilations[0] \times (kernel\_sizes[0] - 1) + 1 + + dkernel[1] &= dilations[1] \times (kernel\_sizes[1] - 1) + 1 + + hout &= \frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 + + wout &= \frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 + + Cout &= C \times kernel\_sizes[0] \times kernel\_sizes[1] + + Lout &= hout \times wout + + + Parameters: + x(Tensor): 4-D Tensor, input tensor of format [N, C, H, W], + data type can be float32 or float64 + kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + or an integer k treated as [k, k]. + strides(int|list): The strides, should be [stride_h, stride_w] + or an integer stride treated as [sride, stride]. + For default, strides will be [1, 1]. + paddings(int|list): The paddings of each dimension, should be + [padding_top, padding_left, padding_bottom, padding_right] + or [padding_h, padding_w] or an integer padding. + If [padding_h, padding_w] was given, it will expanded to + [padding_h, padding_w, padding_h, padding_w]. If an integer + padding was given, [padding, padding, padding, padding] will + be used. For default, paddings will be [0, 0, 0, 0] + dilations(int|list): the dilations of convolution kernel, should be + [dilation_h, dilation_w], or an integer dilation treated as + [dilation, dilation]. For default, it will be [1, 1]. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + + Returns: + The tensor corresponding to the sliding local blocks. + The output shape is [N, Cout, Lout] as decriabled above. + Cout is the total number of values within each block, + and Lout is the total number of such blocks. + The data type of output is the same as the input :math:`x` + + Return Type: + Tensor + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.randn((100,3,224,224)) + y = F.unfold(x, [3, 3], 1, 1, 1) + """ + + helper = LayerHelper("unfold", **locals()) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') + + assert len(x.shape) == 4, \ + "input should be the format of [N, C, H, W]" + + if isinstance(kernel_sizes, int): + kernel_sizes = [kernel_sizes, kernel_sizes] + else: + assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ + "kernel_sizes should either be an integer or a list of two integers" + + if isinstance(strides, int): + strides = [strides, strides] + else: + assert isinstance(strides, list) and (len(strides) == 2), \ + "strides should either be an integer or a list of two integers" + + if isinstance(dilations, int): + dilations = [dilations, dilations] + else: + assert isinstance(dilations, list) and (len(dilations) == 2), \ + "dilations should either be an integer or a list of two integers" + + if isinstance(paddings, int): + paddings = [paddings] * 4 + elif isinstance(paddings, list): + if len(paddings) == 2: + paddings = paddings * 2 + elif len(paddings) == 4: + pass + else: + raise ValueError( + "paddings should either be an integer or a list of 2 or 4 integers" + ) + else: + raise ValueError( + "Unexpected type of paddings, it should be either an integer or a list" + "of 2 or 4 integers") + + if in_dygraph_mode(): + return _C_ops.final_state_unfold(x, kernel_sizes, strides, paddings, + dilations) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="unfold", + inputs={"X": x}, + outputs={"Y": out}, + attrs={ + "kernel_sizes": kernel_sizes, + "strides": strides, + "paddings": paddings, + "dilations": dilations + }) + return out + + def interpolate(x, size=None, scale_factor=None, @@ -1295,7 +1421,23 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): if mode == "constant" and isinstance(pad, ( list, tuple)) and len(pad) == x_dim * 2: - return layers.pad(x, pad, pad_value=value) + paddings = pad + pad_value = value + check_variable_and_dtype(x, 'x', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], "pad") + + helper = LayerHelper('pad', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='pad', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'paddings': paddings, + 'pad_value': float(pad_value)}) + return out assert x_dim in [ 3, 4, 5 diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 6c7f09091ff3c..419014daf64e4 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -138,6 +138,35 @@ def _conv_nd(x, return _C_ops.final_state_add(pre_bias, bias) else: return pre_bias + + if in_dygraph_mode() and op_type == "depthwise_conv2d": + pre_bias = _C_ops.final_state_depthwise_conv2d( + x, weight, stride, padding, padding_algorithm, groups, dilation, + data_format, False, -1, False, False) + if bias is not None: + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return pre_bias + + if in_dygraph_mode() and op_type == "conv3d": + pre_bias = _C_ops.final_state_conv3d( + x, weight, stride, padding, padding_algorithm, groups, 
dilation, + data_format, False, -1, False) + if bias is not None: + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return pre_bias + if in_dynamic_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 2483eab6c053a..5a6bf4c0fa650 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -21,8 +21,12 @@ from ...tensor.creation import assign from ...fluid import dygraph_utils from ...tensor.layer_function_generator import templatedoc -from ...fluid.layers.sequence_lod import sequence_mask #noqa: F401 from paddle import in_dynamic_mode +from paddle import _C_ops +from ...fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode +from ...fluid.data_feeder import check_variable_and_dtype, check_type +from ...framework import core +from ...common_ops_import import convert_np_dtype_to_dtype_ __all__ = [] @@ -140,3 +144,240 @@ def __check_input(input, offset, dim1, dim2): outputs={'Out': [out]}) out.stop_gradient = True return out + + +def sequence_mask(x, maxlen=None, dtype='int64', name=None): + r""" + **SequenceMask Layer** + + This layer outputs a mask according to the input :code:`x` and + :code:`maxlen` with data type of :code:`dtype`. + + Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the + :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: + + .. math:: + + y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n)) + + .. code-block:: text + + Case: + + Consider input: + x = [3, 1, 1, 0] max_len = 4 + + then we get out: + mask = [[1, 1, 1, 0], + [1, 0, 0, 0], + [1, 0, 0, 0], + [0, 0, 0, 0]] + + Args: + x (Variable): Input tensor of sequence_mask layer, \ + whose elements are integers less than :code:`maxlen`. \ + Tensor or LodTensor with shape [d_1, d_2, ..., d_n]. + maxlen (int, optional): Maximum length of the sequence. If :code:`maxlen` \ + is None, it would be replace with :math:`max(x)`. + dtype (np.dtype|paddle.dtype|str, optional): Data type of the output, \ + ``int64`` by default. + name(str, optional): For detailed information, please refer \ + to :ref:`api_guide_Name`. Usually name is no need to set and \ + None by default. + + Returns: The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ + and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, \ + int32 or int64. + + Return Type: Tensor + + Examples: + .. 
code-block:: python + + import paddle + + lengths = paddle.to_tensor([10, 9, 8]) + mask = paddle.nn.functional.sequence_mask(lengths) + + print(mask.numpy()) + # [[1 1 1 1 1 1 1 1 1 1] + # [1 1 1 1 1 1 1 1 1 0] + # [1 1 1 1 1 1 1 1 0 0]] + + """ + + if in_dygraph_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if maxlen is not None: + if isinstance(maxlen, core.eager.Tensor): + attrs = ('out_dtype', dtype) + out = _C_ops.sequence_mask(x, maxlen, *attrs) + else: + attrs = ('out_dtype', dtype, 'maxlen', maxlen) + out = _C_ops.sequence_mask(x, None, *attrs) + out.stop_gradient = True + return out + + helper = LayerHelper('sequence_mask', **locals()) + out = helper.create_variable_for_type_inference(dtype=dtype) + + inputs = {'X': [x]} + attrs = {'out_dtype': out.dtype} + if maxlen is not None: + if isinstance(maxlen, Variable): + inputs['MaxLenTensor'] = maxlen + else: + attrs['maxlen'] = maxlen + + helper.append_op( + type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs) + + out.stop_gradient = True + return out + + +def gather_tree(ids, parents): + r""" + To be used after beam search. After beam search, we get selected ids at + each time step and the corresponding parents in the search tree. Both ids + and parents have the layout :attr:`[max_time, batch_size, beam_size]`. Then + :attr:`gather_tree` is used to backtrace from the last time step and + generate the full sequences by collecting selected ids. + + Here is an example: + + .. code-block:: text + + Given: + ids = [[[2 2] + [6 1]] + [[3 9] + [6 1]] + [[0 1] + [9 0]]] + parents = [[[0 0] + [1 1]] + [[1 0] + [1 0]] + [[0 0] + [0 1]]] + + Then: + gather_tree(ids, parents) + = [[[2 2] + [1 6]] + [[3 3] + [6 1]] + [[0 1] + [9 0]]] + + Args: + ids(Tensor): A Tensor with shape :attr:`[length, batch_size, beam_size]` + and data type :attr:`int32` or :attr:`int64`. It contains the selected + ids of all time steps. + parents(Tensor): A Tensor with the same shape and data type as :attr:`ids`, + It contains the parents corresponding to selected ids when searching + among beams. + + Returns: + A Tensor with the same shape and data type as :attr:`ids`. \ + It contains the full sequences. The sequences are collected from \ + :attr:`ids` by backtracing according to :attr:`parents`. + + Examples: + .. 
code-block:: python + + import paddle + + ids = paddle.to_tensor([[[2, 2], [6, 1]], [[3, 9], [6, 1]], [[0, 1], [9, 0]]]) + + parents = paddle.to_tensor([[[0, 0], [1, 1]], [[1, 0], [1, 0]], [[0, 0], [0, 1]]]) + + final_sequences = paddle.nn.functional.gather_tree(ids, parents) + # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]] + + """ + if in_dygraph_mode(): + return _C_ops.final_state_gather_tree(ids, parents) + else: + if _in_legacy_dygraph(): + return _C_ops.gather_tree(ids, parents) + else: + helper = LayerHelper('gather_tree', **locals()) + check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], + 'gather_tree') + check_variable_and_dtype(parents, 'parents', ['int32', 'int64'], + 'gather_tree') + out = helper.create_variable_for_type_inference(dtype=ids.dtype) + + helper.append_op( + type="gather_tree", + inputs={"Ids": ids, + "Parents": parents}, + outputs={"Out": out}) + + return out + + +@templatedoc() +def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): + """ + + **Temporal Shift Operator** + + ${comment} + + Args: + x(Tensor): ${x_comment} + seg_num(int): ${seg_num_comment} + shift_ratio(float): ${shift_ratio_comment} + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCHW" or "NHWC". Default: "NCHW". + + Returns: + out(Tensor): The temporal shifting result is a tensor with the + same shape and same data type as the input. + + Raises: + TypeError: seg_num must be int type. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.randn([6, 4, 2, 2]) + out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) + """ + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. 
" + "Received Attr(data_format): {}.".format(data_format)) + if _non_static_mode(): + return _C_ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', + shift_ratio, 'data_format', data_format) + + helper = LayerHelper("temporal_shift", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') + check_type(seg_num, 'seg_num', int, 'temporal_shift') + check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(seg_num, int): + raise TypeError("seg_num must be int type.") + + helper.append_op( + type="temporal_shift", + inputs={"X": x}, + outputs={"Out": out}, + attrs={ + "seg_num": seg_num, + "shift_ratio": shift_ratio, + "data_format": data_format + }) + return out diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index cfbf015ffa05f..92b3a7054d467 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -200,7 +200,9 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): raise ValueError("padding_idx must be within [-{}, {})".format( weight.shape[0], weight.shape[0])) - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_embedding(x, weight, padding_idx, sparse) + elif _in_legacy_dygraph(): return _C_ops.lookup_table_v2( weight, x, 'is_sparse', sparse, 'is_distributed', False, 'remote_prefetch', False, 'padding_idx', padding_idx) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 9e427d11dbbad..36b360d98ae32 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -21,15 +21,7 @@ import paddle import paddle.fluid as fluid from ...fluid.layers.nn import _elementwise_op_in_dygraph -from ...fluid.layers import dice_loss # noqa: F401 -from ...fluid.layers import log_loss # noqa: F401 -from ...fluid.layers import npair_loss # noqa: F401 from ...tensor.manipulation import reshape -from ...fluid.layers import softmax_with_cross_entropy as fluid_softmax_with_cross_entropy -from ...fluid.layers import square_error_cost # noqa: F401 - -from ...fluid.layers import edit_distance # noqa: F401 -from ...fluid.layers import huber_loss from ...fluid.layer_helper import LayerHelper from ...fluid.framework import _varbase_creator from ...static import Variable @@ -41,6 +33,518 @@ __all__ = [] +def dice_loss(input, label, epsilon=0.00001, name=None): + r""" + + Dice loss for comparing the similarity between the input predictions and the label. + This implementation is for binary classification, where the input is sigmoid + predictions of each pixel, usually used for segmentation task. The dice loss can + be defined as the following equation: + + .. math:: + + dice\_loss &= 1 - \frac{2 * intersection\_area}{total\_area} \\ + &= \frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\ + &= \frac{(union\_area - intersection\_area)}{total\_area} + + + Parameters: + input (Tensor): Tensor, rank>=2, shape is :math:`[N_1, N_2, ..., N_k, D]`, where :math:`N_1` is + the batch_size, :math:`D` is the number of categories. It is usually the output + predictions of sigmoid activation. The data type can be float32 or float64. + label (Tensor): Tensor, the groud truth with the same rank as input, shape is :math:`[N_1, N_2, ..., N_k, 1]`. + where :math:`N_1` is the batch_size. The data type can be int32 or int64. 
+ epsilon (float): The epsilon will be added to the numerator and denominator. + If both input and label are empty, it makes sure dice is 1. + Default: 0.00001 + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor, which shape is [1], data type is the same as `input` . + + Example: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.randn((3,224,224,2)) + label = paddle.randint(high=2, shape=(3,224,224,1)) + predictions = F.softmax(x) + loss = F.dice_loss(input=predictions, label=label) + """ + assert input.dtype in (paddle.float32, paddle.float64) + assert label.dtype in (paddle.int32, paddle.int64) + assert len(input.shape) >= 2, \ + "The rank of input should be greater than or equal to 2." + assert len(input.shape) == len(label.shape), ( + "The rank of input and label should be equal, " + "but received input: %d, label: %d." % + (len(input.shape), len(label.shape))) + assert label.shape[-1] == 1, ("The last dimension of label should be 1, " + "but received %d." % label.shape[-1]) + assert input.shape[:-1] == label.shape[:-1], ( + "All dimensions should be equal except the last one.") + assert input.numel() > 0 and label.numel() > 0, \ + "Any dimension of input and label cannot be equal to 0." + + label = paddle.squeeze(label, [-1]) + label = paddle.nn.functional.one_hot(label, input.shape[-1]) + reduce_dim = list(range(1, len(input.shape))) + inse = paddle.sum(input * label, axis=reduce_dim) + dice_denominator = paddle.sum(input, axis=reduce_dim) + paddle.sum( + label, axis=reduce_dim) + dice_score = 1 - inse * 2 / (dice_denominator + epsilon) + return paddle.mean(dice_score) + + +def log_loss(input, label, epsilon=1e-4, name=None): + r""" + + **Negative Log Loss Layer** + + This layer accepts input predictions and target label and returns the + negative log loss. + + .. math:: + + Out = -label * \log{(input + \epsilon)} + - (1 - label) * \log{(1 - input + \epsilon)} + + Args: + input (Tensor|list): A 2-D tensor with shape [N x 1], where N is the + batch size. This input is a probability computed + by the previous operator. Data type float32. + label (Tensor|list): The ground truth which is a 2-D tensor with + shape [N x 1], where N is the batch size. + Data type float32. + epsilon (float, optional): A small number for numerical stability. Default 1e-4. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + + Returns: + Tensor, which shape is [N x 1], data type is float32. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + label = paddle.randn((10,1)) + prob = paddle.randn((10,1)) + cost = F.log_loss(input=prob, label=label) + """ + if in_dygraph_mode(): + return _C_ops.final_state_log_loss(input, label, epsilon) + + helper = LayerHelper('log_loss', **locals()) + check_variable_and_dtype(input, 'input', ['float32'], 'log_loss') + check_variable_and_dtype(label, 'label', ['float32'], 'log_loss') + + loss = helper.create_variable_for_type_inference(dtype=input.dtype) + + helper.append_op( + type='log_loss', + inputs={'Predicted': [input], + 'Labels': [label]}, + outputs={'Loss': [loss]}, + attrs={'epsilon': epsilon}) + return loss + + +def fluid_softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100, + numeric_stable_mode=True, + return_softmax=False, + axis=-1): + r""" + + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable gradient. + + Because this operator performs a softmax on logits internally, it expects + unscaled logits. This operator should not be used with the output of + softmax operator since that would produce incorrect results. + + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a + single label. + + The equation is as follows: + + 1) Hard label (one-hot label, so every sample has exactly one class) + + .. math:: + + loss_j = -\\text{logits}_{label_j} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K + + 2) Soft label (each sample can have a distribution over all classes) + + .. math:: + + loss_j = -\\sum_{i=0}^{K}\\text{label}_i + \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} + \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + + 3) If :attr:`numeric_stable_mode` is :attr:`True`, softmax is calculated first by: + + .. math:: + + max_j &= \\max_{i=0}^{K}{\\text{logits}_i} + + log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logits_i - max_j) + + softmax_j &= \\exp(logits_j - max_j - {log\\_max\\_sum}_j) + + and then cross entropy loss is calculated by softmax and label. + + Args: + logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. + label (Tensor): The ground truth ``Tensor`` , data type is the same + as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. + soft_label (bool, optional): A flag to indicate whether to interpretant the given + labels as soft labels. Default False. + ignore_index (int, optional): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if :attr:`soft_label` is set to :attr:`False`. + Default: kIgnoreIndex(-100). + numeric_stable_mode (bool, optional): A flag to indicate whether to use a more + numerically stable algorithm. Only valid + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the + algorithm is always numerically stable. 
+ Note that the speed may be slower when use + stable algorithm. Default: True. + return_softmax (bool, optional): A flag indicating whether to return the softmax + along with the cross entropy loss. Default: False. + axis (int, optional): The index of dimension to perform softmax calculations. It + should be in range :math:`[-1, rank - 1]`, while :math:`rank` + is the rank of input :attr:`logits`. Default: -1. + + Returns: + ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), softmax is in the same shape \ + with input logits and cross entropy loss is in \ + the same shape with input logits except shape \ + in dimension :attr:`axis` as 1. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + data = np.random.rand(128).astype("float32") + label = np.random.rand(1).astype("int64") + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + linear = paddle.nn.Linear(128, 100) + x = linear(data) + out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) + print(out) + """ + if _non_static_mode(): + if core.is_compiled_with_npu(): + softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + else: + if in_dygraph_mode(): + softmax, loss = _C_ops.final_state_cross_entropy_with_softmax( + logits, label, soft_label, True, numeric_stable_mode, + ignore_index, axis) + if _in_legacy_dygraph(): + softmax, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + if not return_softmax: + return loss + else: + return loss, softmax + + attrs = { + 'soft_label': soft_label, + 'ignore_index': ignore_index, + 'numeric_stable_mode': numeric_stable_mode, + 'axis': axis + } + helper = LayerHelper('softmax_with_cross_entropy', **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + + outputs = {'Softmax': softmax, 'Loss': loss} + if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): + backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) + outputs['Backprop'] = backprop + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': logits, + 'Label': label}, + outputs=outputs, + attrs=attrs) + + if return_softmax: + return loss, softmax + + return loss + + +def npair_loss(anchor, positive, labels, l2_reg=0.002): + """ + + Npair loss requires paired data. Npair loss has two parts: the first part is L2 + regularizer on the embedding vector; the second part is cross entropy loss which + takes the similarity matrix of anchor and positive as logits. + + For more information, please refer to: + `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ + + Args: + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. + l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. 
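For intuition, here is a minimal sketch of how ``npair_loss`` turns the 1-D ``labels`` into the soft-label targets that are fed to softmax cross entropy; it mirrors the implementation below, and the 3-sample batch and label values are made up purely for illustration. Two samples count as positives for each other exactly when their labels match, and each row is normalized to sum to 1.

.. code-block:: python

    import paddle

    # hypothetical labels for a batch of 3 samples
    labels = paddle.to_tensor([0., 1., 0.], dtype='float32')
    batch_size = labels.shape[0]

    labels = paddle.reshape(labels, shape=[batch_size, 1])
    labels = paddle.tile(labels, repeat_times=[1, batch_size])
    # 1.0 where a pair of samples shares a label, 0.0 otherwise
    targets = paddle.equal(
        labels, paddle.transpose(labels, perm=[1, 0])).astype('float32')
    # normalize each row so the soft labels sum to 1
    targets = targets / paddle.sum(targets, axis=1, keepdim=True)
    # targets:
    # [[0.5, 0. , 0.5],
    #  [0. , 1. , 0. ],
    #  [0.5, 0. , 0.5]]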
+ + + Returns: + A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. + + Examples: + + .. code-block:: python + + import paddle + + DATATYPE = "float32" + + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) + positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) + labels = paddle.rand(shape=(18,), dtype=DATATYPE) + + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) + print(npair_loss) + + """ + check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], + 'npair_loss') + check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], + 'positive') + check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], + 'labels') + Beta = 0.25 + batch_size = labels.shape[0] + + labels = paddle.reshape(labels, shape=[batch_size, 1]) + labels = paddle.tile(labels, repeat_times=[1, batch_size]) + + labels = paddle.equal( + labels, paddle.transpose( + labels, perm=[1, 0])).astype('float32') + labels = labels / paddle.sum(labels, axis=1, keepdim=True) + + l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) \ + + paddle.mean(paddle.sum(paddle.square(positive), 1)) + l2loss = l2loss * Beta * l2_reg + + similarity_matrix = paddle.matmul( + anchor, positive, transpose_x=False, transpose_y=True) + softmax_ce = fluid_softmax_with_cross_entropy( + logits=similarity_matrix, label=labels, soft_label=True) + cross_entropy = paddle.sum(labels * softmax_ce, 0) + celoss = paddle.mean(cross_entropy) + + return l2loss + celoss + + +def square_error_cost(input, label): + r""" + + This op accepts input predictions and target label and returns the + squared error cost. + + For predictions label, and target label, the equation is: + + .. math:: + + Out = (input - label)^2 + + Parameters: + input (Tensor): Input tensor, the data type should be float32. + label (Tensor): Label tensor, the data type should be float32. + + Returns: + The tensor storing the element-wise squared error \ + difference between input and label. + + Return type: Tensor. + + Examples: + + .. code-block:: python + + import paddle + input = paddle.to_tensor([1.1, 1.9]) + label = paddle.to_tensor([1.0, 2.0]) + output = paddle.nn.functional.square_error_cost(input, label) + print(output) + # [0.01, 0.01] + + """ + if _non_static_mode(): + minus_out = _C_ops.elementwise_sub(input, label) + square_out = _C_ops.square(minus_out) + return square_out + + check_variable_and_dtype(input, "input", ['float32', 'float64'], + 'square_error_cost') + check_variable_and_dtype(label, "label", ['float32', 'float64'], + 'square_error_cost') + helper = LayerHelper('square_error_cost', **locals()) + minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='elementwise_sub', + inputs={'X': [input], + 'Y': [label]}, + outputs={'Out': [minus_out]}) + + square_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='square', inputs={'X': [minus_out]}, + outputs={'Out': [square_out]}) + return square_out + + +def edit_distance(input, + label, + normalized=True, + ignored_tokens=None, + input_length=None, + label_length=None): + """ + This op computes the edit distances, also called Levenshtein distance, between a batch of + hypothesis strings and their references. It measures how dissimilar two strings are by counting + the minimum number of operations to transform one string into another. + The operations include insertion, deletion, and substitution. 
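As a point of reference only (this is not the op's kernel), the per-pair value being computed is the classic dynamic-programming Levenshtein distance; a minimal pure-Python sketch of that recurrence, checked against the "kitten"/"sitting" example below, is:

.. code-block:: python

    def levenshtein(hyp, ref):
        # dp[i][j] = edit distance between hyp[:i] and ref[:j]
        dp = [[0] * (len(ref) + 1) for _ in range(len(hyp) + 1)]
        for i in range(len(hyp) + 1):
            dp[i][0] = i              # delete all i characters of hyp[:i]
        for j in range(len(ref) + 1):
            dp[0][j] = j              # insert all j characters of ref[:j]
        for i in range(1, len(hyp) + 1):
            for j in range(1, len(ref) + 1):
                cost = 0 if hyp[i - 1] == ref[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                               dp[i][j - 1] + 1,         # insertion
                               dp[i - 1][j - 1] + cost)  # substitution
        return dp[-1][-1]

    assert levenshtein("kitten", "sitting") == 3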
+ + For example, given hypothesis string A = "kitten" and reference + B = "sitting", A will be transformed into B + at least after two substitutions and one insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + + So the edit distance between A and B is 3. + + The input is a Tensor, the input_length and label_length should be supported. + + The `batch_size` of labels should be same as `input`. + + The output include the edit distance value between every pair of input and related label, and the number of sequence. + If Attr(normalized) is true, + the edit distance value will be divided by the length of label. + + Parameters: + input(Tensor): The input tensor, its rank should be equal to 2 and its data type should be int64. + label(Tensor): The label tensor, its rank should be equal to 2 and its data type should be int64. + normalized(bool, default True): Indicated whether to normalize the edit distance. + ignored_tokens(list, default None): Tokens that will be removed before + calculating edit distance. + input_length(Tensor): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. + label_length(Tensor): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. + NOTE: To be avoid unexpected result, the value of every elements in input_length and label_length should be equal to the value of the second dimension of input and label. For example, The input: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], the shape of input is [3,4] and the input_length should be [4,4,4] + NOTE: This Api is different from fluid.metrics.EditDistance + + Returns: + Tuple: + + distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1). + sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,). + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.to_tensor([[1,2,3],[4,5,6],[4,4,4],[1,1,1]], dtype='int64') + label = paddle.to_tensor([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]], dtype='int64') + input_len = paddle.to_tensor([3,3,3,3], dtype='int64') + label_len = paddle.to_tensor([4,4,4,4], dtype='int64') + + distance, sequence_num = F.loss.edit_distance(input=input, label=label, input_length=input_len, label_length=label_len, normalized=False) + + # print(distance) + # [[3.] + # [2.] + # [4.] + # [1.]] + # if set normalized to True + # [[0.75] + # [0.5 ] + # [1. 
] + # [0.25] + # + # print(sequence_num) + # [4] + + """ + check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') + check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') + helper = LayerHelper("edit_distance", **locals()) + + # remove some tokens from input and labels + if ignored_tokens is not None and len(ignored_tokens) > 0: + erased_input = helper.create_variable_for_type_inference(dtype="int64") + erased_label = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="sequence_erase", + inputs={"X": [input]}, + outputs={"Out": [erased_input]}, + attrs={"tokens": ignored_tokens}) + input = erased_input + + helper.append_op( + type="sequence_erase", + inputs={"X": [label]}, + outputs={"Out": [erased_label]}, + attrs={"tokens": ignored_tokens}) + label = erased_label + + this_inputs = {"Hyps": [input], "Refs": [label]} + if input_length is not None and label_length is not None: + this_inputs['HypsLength'] = [input_length] + this_inputs['RefsLength'] = [label_length] + + # edit distance op + edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") + sequence_num = helper.create_variable_for_type_inference(dtype="int64") + helper.append_op( + type="edit_distance", + inputs=this_inputs, + outputs={"Out": [edit_distance_out], + "SequenceNum": [sequence_num]}, + attrs={"normalized": normalized}) + + return edit_distance_out, sequence_num + + def binary_cross_entropy(input, label, weight=None, reduction='mean', name=None): """ @@ -138,10 +642,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', else: return out else: - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'binary_cross_entropy') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'binary_cross_entropy') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'binary_cross_entropy') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'binary_cross_entropy') sub_name = name if weight is None and reduction == 'none' else None helper = LayerHelper("binary_cross_entropy", name=sub_name) @@ -288,12 +792,10 @@ def binary_cross_entropy_with_logits(logit, else: return out - fluid.data_feeder.check_variable_and_dtype( - logit, 'logit', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') sigmoid_name = None if reduction == 'none' and pos_weight is None and weight is None: sigmoid_name = name @@ -303,18 +805,17 @@ def binary_cross_entropy_with_logits(logit, one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype) if pos_weight is not None: - fluid.data_feeder.check_variable_and_dtype( - pos_weight, 'pos_weight', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(pos_weight, 'pos_weight', + ['float32', 'float64'], + 'binary_cross_entropy_with_logits') log_weight = paddle.add( paddle.multiply(label, paddle.subtract(pos_weight, one)), one) pos_weight_name = name if reduction == 'none' and weight is None else None out = paddle.multiply(out, log_weight, name=pos_weight_name) if weight is not None: - fluid.data_feeder.check_variable_and_dtype( - weight, 'weight', 
['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') weight_name = name if reduction == 'none' else None out = paddle.multiply(out, weight, name=weight_name) @@ -519,12 +1020,26 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): output = paddle.nn.functional.smooth_l1_loss(input, label) print(output) """ - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'smooth_l1_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'smooth_l1_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'smooth_l1_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'smooth_l1_loss') - out = huber_loss(input=input, label=label, delta=delta) + if in_dygraph_mode(): + out, residual = _C_ops.final_state_huber_loss(input, label, delta) + else: + helper = LayerHelper('huber_loss', **locals()) + residual = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='huber_loss', + inputs={'X': input, + 'Y': label}, + outputs={'Out': out, + 'Residual': residual}, + attrs={'delta': delta}) if reduction not in ['sum', 'mean', 'none']: raise ValueError( @@ -615,12 +1130,12 @@ def margin_ranking_loss(input, return out helper = LayerHelper("margin_ranking_loss", **locals()) - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'margin_rank_loss') - fluid.data_feeder.check_variable_and_dtype( - other, 'other', ['float32', 'float64'], 'margin_rank_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'margin_rank_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'margin_rank_loss') + check_variable_and_dtype(other, 'other', ['float32', 'float64'], + 'margin_rank_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'margin_rank_loss') out = paddle.subtract(other, input) out = paddle.multiply(out, label) @@ -738,9 +1253,9 @@ def l1_loss(input, label, reduction='mean', name=None): else: return unreduced - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype( input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss') - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype( label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss') if reduction == 'sum': @@ -847,10 +1362,8 @@ def nll_loss(input, label = reshape(label, shape=[n, 1, -1]) out_shape = [n] + input_shape[2:] - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'nll_loss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], - 'nll_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nll_loss') + check_variable_and_dtype(label, 'label', ['int64'], 'nll_loss') inputs = {'X': input, 'Label': label} attrs = {'reduction': reduction, 'ignore_index': ignore_index} if weight is not None: @@ -971,10 +1484,8 @@ def kl_div(input, label, reduction='mean', name=None): helper = LayerHelper('kl_div', **locals()) - fluid.data_feeder.check_variable_and_dtype(input, 'input', - ['float32', 'float64'], 'kl_div') - fluid.data_feeder.check_variable_and_dtype(label, 'label', - ['float32', 'float64'], 'kl_div') + check_variable_and_dtype(input, 'input', 
['float32', 'float64'], 'kl_div') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'kl_div') fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div') loss = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -1051,10 +1562,10 @@ def mse_loss(input, label, reduction='mean', name=None): "but received {}.".format(reduction)) if not in_dynamic_mode(): - paddle.fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'mse_loss') - paddle.fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'mse_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'mse_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'mse_loss') if reduction == 'none': return paddle.square(paddle.subtract(input, label), name=name) @@ -1858,9 +2369,9 @@ def cross_entropy(input, out = paddle.squeeze(out, axis=axis) return out - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'softmax_cross_entropy') - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'softmax_cross_entropy') + check_variable_and_dtype( label, 'label', ['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'], 'softmax_cross_entropy') @@ -1887,8 +2398,8 @@ def cross_entropy(input, attrs=attrs) if weight is not None: - fluid.data_feeder.check_variable_and_dtype( - weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'softmax_cross_entropy') weight_name = name if reduction == 'none' else None if soft_label == True: # chajchaj: @@ -2050,9 +2561,8 @@ def sigmoid_focal_loss(logit, % reduction) if normalizer is not None: - fluid.data_feeder.check_variable_and_dtype(normalizer, 'normalizer', - ['float32', 'float64'], - 'sigmoid_focal_loss') + check_variable_and_dtype(normalizer, 'normalizer', + ['float32', 'float64'], 'sigmoid_focal_loss') normalizer_shape = list(normalizer.shape) normalizer_dims = len(normalizer_shape) if normalizer_dims > 1: @@ -2102,10 +2612,10 @@ def sigmoid_focal_loss(logit, return loss - fluid.data_feeder.check_variable_and_dtype( - logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss') + check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], + 'sigmoid_focal_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'sigmoid_focal_loss') bce_name = None if reduction == 'none' and normalizer is None: diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index e719099b4b39d..f64e731342ed2 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -407,8 +407,10 @@ def instance_norm(x, print(instance_norm_out) """ - - if in_dynamic_mode(): + if in_dygraph_mode(): + out, _, _, = _C_ops.final_state_instance_norm(x, weight, bias, eps) + return out + if _in_legacy_dygraph(): out, _, _ = _C_ops.instance_norm(x, weight, bias, "epsilon", eps, "momentum", momentum, "data_format", data_format) diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 6fee5058057cb..2d0cd77ee17e9 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -60,19 +60,19 @@ def __init__(self, mean=0.0, std=1.0, name=None): class 
TruncatedNormal(TruncatedNormalInitializer): - """The Random TruncatedNormal (Gaussian) distribution initializer. + """The truncated normal distribution (Gaussian distribution) initializer. Args: - mean (float, optional): mean of the normal distribution. The default value is 0.0. - std (float, optional): standard deviation of the normal distribution. The default value is 1.0. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + mean (float, optional): Mean of the normal distribution. The default value is :math:`0.0`. + std (float, optional): Standard deviation of the normal distribution. The default value is :math:`1.0`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - A parameter initialized by Random TruncatedNormal (Gaussian) distribution. + A parameter initialized by truncated normal distribution (Gaussian distribution). Examples: .. code-block:: python + :name: initializer_TruncatedNormal-example import paddle diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index cac03b5948071..f07883adbb0ae 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -18,19 +18,19 @@ class Uniform(UniformInitializer): - """The random uniform distribution initializer. + """The uniform distribution initializer. Args: - low (float, optional): lower boundary of the uniform distribution. The default value is -1.0. - high (float, optional): upper boundary of the uniform distribution. The default value is 1.0. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + low (float, optional): Lower boundary of the uniform distribution. The default value is :math:`-1.0`. + high (float, optional): Upper boundary of the uniform distribution. The default value is :math:`1.0`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - A parameter initialized by random uniform distribution. + A parameter initialized by uniform distribution. Examples: .. code-block:: python + :name: initializer_Uniform-example import paddle diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 239989b44e121..ae6aa4ce387eb 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -26,6 +26,7 @@ from .activation import Sigmoid # noqa: F401 from .activation import Softmax # noqa: F401 from .activation import LogSoftmax # noqa: F401 +from .activation import RReLU # noqa: F401 from .activation import Softmax2D # noqa: F401 from .common import Bilinear # noqa: F401 from .common import Pad1D # noqa: F401 diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 7fd109843bede..1a3768e919042 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -436,6 +436,93 @@ def extra_repr(self): name_str) +class RReLU(Layer): + r""" + RReLU activation layer. 
+ + Applies the randomized leaky rectified liner unit function to improve generalization performance, + as described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_ + + During training, randomly samples the negative slope for activation values as described below: + + .. math:: + + RReLU(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + a * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`a` is randomly sampled from uniform distribution in range (:math:`lower`, :math:`upper`), + + In the test phase, the negative slope will take the average value of :math:`lower` and :math:`upper`: + + .. math:: + + RReLU(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + (lower + upper) * 0.5 * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`lower` and :math:`upper` are the bounds of uniform distribution. + + Parameters: + lower (float, optional): The lower bound of uniform distribution. Default: 0.125. + upper (float, optional): The upper bound of uniform distribution. Default: 0.333. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. Default dtype is float32. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + :name: RReLU-example + + import paddle + + input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + + rrelu_layer = paddle.nn.RReLU(0.1, 0.3) + output = rrelu_layer(input_tensor) + #[[[[-0.20000899 3. -0.88108218 5. ] + # [ 3. -0.55175185 5. -1.07761011] + # [-1.06806871 -1.98962009 8. 9. ]] + # [[ 1. -0.52382672 -0.65515128 4. ] + # [-1.37663394 6. 7. -2.34657836] + # [ 6. 7. 8. 9. ]]]] + """ + + def __init__(self, lower=1. / 8., upper=1. / 3., name=None): + super(RReLU, self).__init__() + self._lower = lower + self._upper = upper + self._name = name + + def forward(self, x): + return F.rrelu( + x, lower=self._lower, upper=self._upper, training=self.training) + + def extra_repr(self): + name_str = ', name={}'.format(self._name) if self._name else '' + return 'lower={}, upper={}, training={}, dtype={}{}'.format( + self._lower, self._upper, self.training, self._dtype, name_str) + + class ReLU(Layer): """ ReLU Activation. diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 7c3e3ad8dee9f..6cdfc36d5d61f 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -32,7 +32,7 @@ from ...fluid.dygraph import BatchNorm # noqa: F401 from ...fluid.dygraph import SpectralNorm # noqa: F401 -from ...framework import get_default_dtype, set_default_dtype +from ...framework import get_default_dtype, set_default_dtype, _non_static_mode from ..initializer import Constant from ...framework import ParamAttr @@ -404,6 +404,25 @@ def __init__(self, self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. 
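For context, the hunk below adds a dygraph fast path to ``GroupNorm.forward``; a minimal usage sketch of the layer it touches (shapes picked arbitrarily for illustration) looks like:

.. code-block:: python

    import paddle

    x = paddle.rand([2, 6, 4, 4])       # NCHW input with 6 channels
    group_norm = paddle.nn.GroupNorm(num_groups=3, num_channels=6)
    out = group_norm(x)                 # normalized per group of 2 channels, shape [2, 6, 4, 4]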
def forward(self, input): + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + + if _non_static_mode(): + pre_act, _, _ = _C_ops.group_norm( + input, + self.weight, + self.bias, + mean_out, + variance_out, + 'epsilon', + self._epsilon, + 'groups', + self._num_groups, ) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=None) + inputs = {'X': input} if self.bias is not None: inputs['Bias'] = self.bias @@ -411,10 +430,6 @@ def forward(self, input): inputs['Scale'] = self.weight # create output - mean_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) group_norm_out = self._helper.create_variable_for_type_inference( dtype=input.dtype) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0fa49745a95fb..0b61f3cb9a787 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -12,11 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings +from collections import defaultdict from .optimizer import Optimizer -from .adam import Adam +from .lr import LRScheduler from ..fluid import core from ..fluid import framework -from ..fluid.framework import Variable +from ..fluid.framework import Variable, Parameter +from ..fluid import unique_name +from ..fluid import layers +from ..fluid.layer_helper import LayerHelper +from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base from collections.abc import Callable from .. import _C_ops @@ -25,7 +31,7 @@ __all__ = [] -class AdamW(Adam): +class AdamW(Optimizer): r""" The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. @@ -102,14 +108,14 @@ class AdamW(Adam): beta1 = paddle.to_tensor([0.9], dtype="float32") beta2 = paddle.to_tensor([0.99], dtype="float32") - adam = paddle.optimizer.AdamW(learning_rate=0.1, + opt = paddle.optimizer.AdamW(learning_rate=0.1, parameters=linear.parameters(), beta1=beta1, beta2=beta2, weight_decay=0.01) out.backward() - adam.step() - adam.clear_grad() + opt.step() + opt.clear_grad() #Note that the learning_rate of linear_2 is 0.01. 
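Because the rewritten ``AdamW`` keeps ``apply_decay_param_fun`` as the hook for excluding parameters from decoupled weight decay, a small usage sketch may help; the ``.b_`` filter below assumes paddle's default parameter naming for biases and is only illustrative:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    inp = paddle.rand([10, 10], dtype="float32")
    loss = paddle.mean(linear(inp))

    # decay everything except parameters whose (default) name marks them as biases
    decay_params = [
        p.name for p in linear.parameters() if ".b_" not in p.name
    ]
    opt = paddle.optimizer.AdamW(
        learning_rate=0.01,
        parameters=linear.parameters(),
        weight_decay=0.01,
        apply_decay_param_fun=lambda name: name in decay_params)

    loss.backward()
    opt.step()
    opt.clear_grad()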
@@ -119,7 +125,7 @@ class AdamW(Adam): out = linear_1(inp) out = linear_2(out) loss = paddle.mean(out) - adam = paddle.optimizer.AdamW( + opt = paddle.optimizer.AdamW( learning_rate=0.1, parameters=[{ 'params': linear_1.parameters() @@ -132,11 +138,16 @@ class AdamW(Adam): weight_decay=0.01, beta1=0.9) out.backward() - adam.step() - adam.clear_grad() + opt.step() + opt.clear_grad() """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + def __init__(self, learning_rate=0.001, beta1=0.9, @@ -160,37 +171,108 @@ def __init__(self, raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") if not 0 <= epsilon: raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") - coeff = weight_decay - if not isinstance(coeff, float) and \ - not isinstance(coeff, framework.Variable): - raise TypeError("coeff should be float or Tensor.") - self._params_name = set() - self._apply_decay_param_fun = apply_decay_param_fun - self._coeff = coeff - self._lr_to_coeff = dict() + if not isinstance(weight_decay, float) and \ + not isinstance(weight_decay, framework.Variable): + raise TypeError("weight_decay should be float or Tensor.") if lr_ratio is not None: assert isinstance(lr_ratio, Callable) if not core.is_compiled_with_cuda(): raise NotImplementedError( "'lr_ratio' is unimplemented in CPU, XPU and NPU") - self._lr_ratio = lr_ratio - super(AdamW, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=grad_clip, - name=name, - lazy_mode=lazy_mode, - multi_precision=multi_precision) - self._default_dict = {'coeff': coeff} + if parameters is not None: + # paddle.Tensor is also iterable, so here we don't check whether + # the input is iterable, if the input is paddle.Tensor, the + # list(paddle.Tensor) will be a error value + if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)): + raise TypeError( + "`parameters` argument given to the optimizer should be " + "an iterable of paddle Tensors, but got argument type is `{}`.". + format(type(parameters))) + if isinstance(parameters, dict): + raise TypeError( + "`parameters` argument should not get dict type, " + "if parameter groups is needed, please set `parameters`" + " as list of dict") + self._parameter_list = list(parameters) + else: + self._parameter_list = None + + self._name = name + if framework._non_static_mode(): + if self._parameter_list is None: + raise AttributeError( + "parameters argument given to the Optimizer should not be None in dygraph mode." + ) + + if not isinstance(learning_rate, (float, LRScheduler)): + raise TypeError( + "learning rate should be float or LRScheduler, got %s here" % + type(learning_rate)) + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + + self._dtype = None + # Infer the dtype form parameter + if self._parameter_list: + if isinstance(self._parameter_list[0], dict): + for param_group in self._parameter_list: + assert 'params' in param_group, \ + 'params should be set in parameters if parameter groups are optimized in different options' + self._dtype = self._parameter_list[0]['params'][0].dtype + else: + self._dtype = self._parameter_list[0].dtype + + # each program should have a independent learning rate + # program -> tensor(learning_rate) + self._learning_rate_map = dict() + # Dictionary of accumulators. 
Some optimizer subclasses need to + # allocate and manage extra tensors associated with the parameters + # to train. These tensors are called accumulators. + # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} + self._accumulators = defaultdict(lambda: dict()) + self.helper = None + self._opti_name_list = [] + self._accumulators_holder = {} + self._param_device_map = dict() + self.clear_gradients = self.clear_grad self.type = "adamw" + self._learning_rate = learning_rate + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._weight_decay = weight_decay + self._grad_clip = grad_clip + self._lr_ratio = lr_ratio + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + self._lazy_mode = lazy_mode + self._multi_precision = multi_precision + self._master_weights = {} + + self._default_dict = { + 'weight_decay': weight_decay, + 'beta1': beta1, + 'beta2': beta2, + 'epsilon': epsilon, + 'lazy_mode': lazy_mode, + 'grad_clip': grad_clip + } + + self._param_groups = [] + if self._parameter_list and isinstance(self._parameter_list[0], dict): + for param_group in self._parameter_list: + self._add_param_group(param_group.copy()) + else: + self._param_groups = self._parameter_list - # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. - self._auxiliary_vars = dict() + self._use_multi_tensor = None + self.regularization = None + self._auxiliary_vars = {} def _set_auxiliary_var(self, key, val): self._auxiliary_vars[key] = val @@ -201,58 +283,128 @@ def _get_auxiliary_var(self, key): else: return None - def _append_decoupled_weight_decay(self, block, param_and_grad): + def _add_param_group(self, param_group): """ - Add decoupled weight decay op. - parameter = parameter - parameter * coeff * lr + Add a param group to parameter_list. + Args: - block: block in which variable is to be created - param_and_grad: (parameters, gradients) pairs, - the parameters need to decay. - Raises: - Exception: The type of coeff and parameter is not consistent. + param_group (dict): The group of Tensors to be optimzed with + different optimization options. """ - if isinstance(param_and_grad, dict): - param_and_grad = self._update_param_group(param_and_grad) - param, grad = param_and_grad + params = param_group['params'] + if isinstance(params, Parameter): + param_group['params'] = [params] + elif isinstance(params, set): + raise TypeError( + "optimizer parameters should be in ordered collections," + "but received set, please use list instead.") + else: + param_group['params'] = list(params) - if self._apply_decay_param_fun is not None \ - and not self._apply_decay_param_fun(param.name): - return + # Update optimization options for each groups + for k, v in self._default_dict.items(): + param_group.setdefault(k, v) + + param_set = set() + for group in self._param_groups: + param_set.update(set(group['params'])) + + if not param_set.isdisjoint(set(param_group['params'])): + raise ValueError( + "some parameters appear in more than one parameter group") - if isinstance(self._learning_rate, float): - learning_rate = self._learning_rate + for param in param_group['params']: + param.optimize_attr['learning_rate'] = param_group.get( + 'learning_rate', 1.) + + self._param_groups.append(param_group) + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] else: - # NOTE. 
We add this function to the _append_optimize_op(), - # for we must make sure _create_param_lr() be called after - # optimizer._create_global_learning_rate(). - learning_rate = self._create_param_lr(param_and_grad) - - with block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - self._params_name.add(param.name) - - # If it has been calculated, the result will be reused. - # NOTE(wangxi): In dygraph mode, apply_gradient will be executed - # every step, so need clear _lr_to_coeff every step, - # we do this in _create_optimization_pass - decay_coeff = self._lr_to_coeff.get(learning_rate, None) - if decay_coeff is None: - # NOTE(wangxi): for pipeline to set device:all - with paddle.static.device_guard(None): - decay_coeff = 1.0 - learning_rate * self._coeff - self._lr_to_coeff[learning_rate] = decay_coeff - - find_master = (self._multi_precision and - param.dtype == core.VarDesc.VarType.FP16) - if find_master: - master_weight = self._master_weights[param.name] - scaled_param = master_weight * decay_coeff - paddle.fluid.layers.assign( - input=scaled_param, output=master_weight) - else: - scaled_param = param * decay_coeff - paddle.fluid.layers.assign(input=scaled_param, output=param) + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + target_param = self._master_weights[ + param.name] if find_master else param + target_name = target_param.name + if (name not in self._accumulators or + target_name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". 
+ format(name, target_name)) + return self._accumulators[name][target_name] + + def _add_moments_pows(self, p): + acc_dtype = p.dtype + if acc_dtype == core.VarDesc.VarType.FP16: + acc_dtype = core.VarDesc.VarType.FP32 + self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) + self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype=acc_dtype, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + dtype=acc_dtype, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + if isinstance(parameters, dict): + parameters = self._update_param_group(parameters) + + # Create accumulator tensors for first and second moments + for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_moments_pows(master_p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Adam optimizer." + ) + self._add_moments_pows(p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -295,8 +447,9 @@ def _append_optimize_op(self, block, param_and_grad): _, _, _, _, _, _ = _C_ops.final_state_adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, - _beta1, _beta2, self._epsilon, lr_ratio_, self._coeff, - with_decay, self._lazy_mode, 1000, find_master, False) + _beta1, _beta2, self._epsilon, lr_ratio_, + self._weight_decay, with_decay, self._lazy_mode, 1000, + find_master, False) else: _, _, _, _, _, _ = _C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, @@ -306,8 +459,8 @@ def _append_optimize_op(self, block, param_and_grad): 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, 'beta2', _beta2, "with_decay", with_decay, 'coeff', - self._coeff, 'multi_precision', find_master, 'lr_ratio', - lr_ratio_) + self._weight_decay, 'multi_precision', find_master, + 'lr_ratio', lr_ratio_) return None inputs = { @@ -338,7 +491,7 @@ def _append_optimize_op(self, block, param_and_grad): "min_row_size_to_use_multithread": 1000, "multi_precision": find_master, "with_decay": with_decay, - "coeff": self._coeff, + "coeff": self._weight_decay, "lr_ratio": 1. if self._lr_ratio is None else self._lr_ratio(param_and_grad[0]) } @@ -369,17 +522,96 @@ def _append_optimize_op(self, block, param_and_grad): return adamw_op - def _create_optimization_pass(self, parameters_and_grads): - optimize_ops = super( - AdamW, self)._create_optimization_pass(parameters_and_grads) - # In dygraph mode, clear _lr_to_coeff after applied gradient - self._lr_to_coeff = dict() - return optimize_ops - def __str__(self): return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) + @imperative_base.no_grad + @framework.dygraph_only + def step(self): + """ + Execute the optimizer and update parameters once. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + + a = paddle.rand([2,13], dtype="float32") + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + opt = paddle.optimizer.AdamW(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + opt.step() + opt.clear_grad() + """ + if not isinstance(self._parameter_list[0], dict): + params_grads = [] + for param in self._parameter_list: + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + if framework.in_dygraph_mode(): + if hasattr(grad_var, "is_selected_rows" + ) and grad_var.is_selected_rows( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if hasattr(grad_var, + "_is_sparse") and grad_var._is_sparse( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + params_grads.append((param, grad_var)) + + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + else: + # optimize parameters in groups + for param_group in self._param_groups: + params_grads = defaultdict(lambda: list()) + for param in param_group['params']: + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + if framework.in_dygraph_mode(): + if hasattr(grad_var, "is_selected_rows" + ) and grad_var.is_selected_rows( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if hasattr(grad_var, + "_is_sparse") and grad_var._is_sparse( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + params_grads['params'].append((param, grad_var)) + params_grads.update( + {k: v + for k, v in param_group.items() if k != 'params'}) + self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + def _update_param_group(self, parameters): - self._coeff = parameters.get('coeff', self._default_dict['coeff']) + self._beta1 = parameters.get('beta1', self._default_dict['beta1']) + self._beta2 = parameters.get('beta2', self._default_dict['beta2']) + self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) + self._lazy_mode = parameters.get('lazy_mode', + self._default_dict['lazy_mode']) + self._weight_decay = parameters.get('weight_decay', + self._default_dict['weight_decay']) parameters = parameters.get('params') + return parameters diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 9dfec3947e95f..cf180fccc4857 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -58,6 +58,8 @@ def append_backward_new(loss_list, program = default_main_program() assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block." 
block = program.current_block() + for el in loss_list: + assert el.block == block, f'variable in loss_list should be in current block of main program' orig2prim(block) ad = Transform(block) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 283bce1cc817f..478f4b6351fbf 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -263,6 +263,7 @@ from .stat import var # noqa: F401 from .stat import numel # noqa: F401 from .stat import median # noqa: F401 +from .stat import nanmedian # noqa: F401 from .stat import quantile # noqa: F401 from .stat import nanquantile # noqa: F401 @@ -448,6 +449,7 @@ 'var', 'numel', 'median', + 'nanmedian', 'quantile', 'nanquantile', 'is_complex', diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index d3430ba81b859..5a1aa5dcfcef5 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -482,19 +482,20 @@ def full_like(x, fill_value, dtype=None, name=None): def ones(shape, dtype=None, name=None): """ - The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 1. + Create a Tensor of specified :attr:`shape` and :attr:`dtype` and fill it with 1. Args: - shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape is int32 or int64. - dtype(np.dtype|str, optional): Data type of output Tensor, it supports - bool, float16, float32, float64, int32 and int64. Default: if None, the data type is 'float32'. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + shape (tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape should be int32 or int64. + dtype (np.dtype|str, optional): Data type of output Tensor, it should be one of + bool, float16, float32, float64, int32 and int64. If it is set to None, the data type will be float32. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. + Tensor: A Tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements are 1. Examples: .. 
code-block:: python + :name: ones-example import paddle @@ -1568,14 +1569,19 @@ def assign(x, output=None): if output is None: output = helper.create_variable_for_type_inference( dtype=input.dtype) - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + if _non_static_mode(): + _C_ops.assign_value(output, 'shape', + list(input.shape), 'dtype', dtype, value_name, + values) + else: + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) if is_inplace and _in_legacy_dygraph(): output._bump_inplace_version() diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 7f95dd60eda8a..72e5eb640125d 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -21,7 +21,7 @@ from six.moves import cStringIO from ..static import Variable from ..fluid.proto import framework_pb2 -from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode +from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype import paddle @@ -271,9 +271,10 @@ def func(x, name=None): op_type) else: # abs exp square ops support dtype(int32, int64, float16, float32, float64) - check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + check_variable_and_dtype(x, 'x', [ + 'int32', 'int64', 'float16', 'float32', 'float64', 'complex64', + 'complex128' + ], op_type) helper = LayerHelper(op_type, **locals()) @@ -302,7 +303,7 @@ def generate_inplace_fn(inplace_op_type): origin_op_type = inplace_op_type[:-1] def func(x, name=None): - if paddle.in_dynamic_mode(): + if _non_static_mode(): op = getattr(_C_ops, inplace_op_type) return op(x) warnings.warn( diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 1194d81a360db..0def896db8f73 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -30,25 +30,24 @@ def bernoulli(x, name=None): """ - This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution. - The input ``x`` is a tensor with probabilities for generating the random binary number. - Each element in ``x`` should be in [0, 1], and the out is generated by: - - .. math:: + For each element :math:`x_i` in input ``x``, take a sample from the Bernoulli distribution, also called two-point distribution, with success probability :math:`x_i`. The Bernoulli distribution with success probability :math:`x_i` is a discrete probability distribution with probability mass function - out_i ~ Bernoulli (x_i) + .. math:: + p(y)=\\begin{cases} + x_i,&y=1\\\\ + 1-x_i,&y=0 + \end{cases}. Args: - x(Tensor): A tensor with probabilities for generating the random binary number. The data type - should be float32, float64. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + x (Tensor): The input Tensor, it's data type should be float32, float64. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. 
+ + Returns: - Tensor: A Tensor filled with random binary number with the same shape and dtype as ``x``. + Tensor: A Tensor filled with samples from the Bernoulli distribution, whose shape and dtype are the same as ``x``. Examples: .. code-block:: python + :name: bernoulli-example import paddle @@ -86,7 +85,7 @@ def bernoulli(x, name=None): def poisson(x, name=None): r""" - This OP returns a tensor filled with random number from a Poisson Distribution. + Returns a tensor filled with random numbers from a Poisson distribution. .. math:: @@ -129,7 +128,7 @@ def poisson(x, name=None): def multinomial(x, num_samples=1, replacement=False, name=None): """ - This OP returns a Tensor filled with random values sampled from a Multinomical + Returns a Tensor filled with random values sampled from a Multinomial distribution. The input ``x`` is a tensor with probabilities for generating the random number. Each element in ``x`` should be larger or equal to 0, but not all 0. ``replacement`` indicates whether it is a replaceable sample. If ``replacement`` @@ -278,7 +277,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): def standard_normal(shape, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a standard + Returns a Tensor filled with random values sampled from a standard normal distribution with mean 0 and standard deviation 1, with ``shape`` and ``dtype``. @@ -387,7 +386,7 @@ def randn(shape, dtype=None, name=None): def normal(mean=0.0, std=1.0, shape=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a normal + Returns a Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` (standard deviation) . If ``mean`` is a Tensor, the output Tensor has the same shape and data type as ``mean``. @@ -475,7 +474,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): """ - This OP returns a Tensor filled with random values sampled from a uniform + Returns a Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. Examples: @@ -505,20 +504,16 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): it will use the seed of the global default generator (which can be set by paddle.seed). Note that if seed is not 0, this operator will always generate the same random numbers every time. Default is 0. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name(str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python + :name: code-example1 import paddle @@ -625,7 +620,7 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): def randint(low=0, high=None, shape=[1], dtype=None, name=None): """ - This OP returns a Tensor filled with random integers from a discrete uniform + Returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``.
If ``high`` is None (the default), the range is [0, ``low``). @@ -731,7 +726,7 @@ def randint_like(x, low=0, high=None, dtype=None, name=None): """ - This OP returns a Tensor filled with random integers from a discrete uniform + Returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with the same shape as ``x``. (use ``dtype`` if ``dtype`` is not None) If ``high`` is None (the default), the range is [0, ``low``). @@ -957,7 +952,7 @@ def randperm(n, dtype="int64", name=None): def rand(shape, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a uniform + Returns a Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. Args: diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 52ccc60100996..372454b97a6be 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -241,6 +241,103 @@ def numel(x, name=None): return out +def nanmedian(x, axis=None, keepdim=True, name=None): + r""" + Compute the median along the specified axis, while ignoring NaNs. + + If the count of valid elements is an even number, + the average value of the two elements in the middle is calculated as the median. + + Args: + x (Tensor): The input Tensor, its data type can be int32, int64, float16, float32, float64. + axis (None|int|list|tuple, optional): + The axis along which to perform median calculations. ``axis`` should be an int or a list of int. + ``axis`` should be in range [-D, D), where D is the number of dimensions of ``x`` . + If ``axis`` is less than 0, it works the same way as :math:`axis + D`. + If ``axis`` is None, the median is calculated over all elements of ``x``. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor are the same as ``x`` except in the reduced + dimensions (each of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is True. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, results of median along ``axis`` of ``x``. The output dtype is the same as ``x``. + + Examples: + .. code-block:: python + :name: nanmedian-example + + import paddle + x = paddle.to_tensor([[float('nan'), 2. , 3. ], [0. , 1. , 2. ]]) + + y1 = x.nanmedian() + # y1 is [[2.]] + + y2 = x.nanmedian(0) + # y2 is [[0., 1.5, 2.5]] + + y3 = x.nanmedian(0, keepdim=False) + # y3 is [0., 1.5, 2.5] + + y4 = x.nanmedian((0, 1)) + # y4 is [[2.]] + """ + if not isinstance(x, Variable): + raise TypeError("In nanmedian, the input x should be a Tensor.") + + if isinstance(axis, (list, tuple)) and len(axis) == 0: + raise ValueError("Axis list should not be empty.") + + dims = len(x.shape) + if axis is None: + axis = [] + elif isinstance(axis, tuple): + axis = list(axis) + elif isinstance(axis, int): + axis = [axis] + + if not isinstance(axis, list): + raise ValueError( + "Axis should be None, int, or a list, and each element should be in range [-rank(x), rank(x))." + ) + + for i in range(len(axis)): + if not isinstance(axis[i], int) or not (axis[i] < dims and + axis[i] >= -dims): + raise ValueError( + "Axis should be None, int, or a list, and each element should be in range [-rank(x), rank(x))."
+ ) + if axis[i] < 0: + axis[i] += dims + + if len(axis) != len(set(axis)): + raise ValueError("Axis has duplicated elements.") + + if _in_legacy_dygraph(): + median_index, out = _C_ops.nanmedian(x, 'axis', axis, 'keepdim', + keepdim) + return out + + check_variable_and_dtype( + x, 'X', ['int32', 'int64', 'float16', 'float32', 'float64'], + 'nanmedian') + + helper = LayerHelper('nanmedian', **locals()) + attrs = {'axis': axis, 'keepdim': keepdim} + out = helper.create_variable_for_type_inference(x.dtype) + medians = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='nanmedian', + inputs={'X': x}, + outputs={'Out': out, + 'MedianIndex': medians}, + attrs=attrs) + return out + + def median(x, axis=None, keepdim=False, name=None): """ Compute the median along the specified axis. diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index c541891662864..8ed4832a8f751 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -407,6 +407,12 @@ use_gpudnn : true backward : conv2d_transpose_grad +- api : conv3d + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor + invoke : conv3d_impl(input, filter, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + backward : conv3d_grad + - api : conv3d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) @@ -492,6 +498,17 @@ optional : mask backward : deformable_conv_grad +- api : depthwise_conv2d + args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + output : Tensor(out) + invoke : conv2d_impl(x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + backward : depthwise_conv2d_grad + # infer_meta : + # func : ConvTransposeInferMeta + # prams: [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search] + # kernel : + # func : depthwise_conv2d + - api : depthwise_conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) @@ -613,6 +630,12 @@ func : elu backward : elu_grad +- api : embedding + args : (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) + output : Tensor + invoke : embedding_impl(x, weight, padding_idx, sparse) + backward : embedding_grad + - api : empty args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor @@ -1007,6 +1030,17 @@ data_type : x backward : index_select_grad +- api : instance_norm + args : (Tensor x, Tensor scale, Tensor bias, float epsilon) + output : Tensor(y), Tensor(saved_mean), Tensor(saved_variance) + infer_meta : + func : InstanceNormInferMeta + kernel : + func : instance_norm + data_type : x + optional : scale, bias + backward : instance_norm_grad + # is_empty - api : is_empty args : (Tensor x) diff --git 
a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 1638f6afab20c..1f19dec992d2f 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -48,8 +48,7 @@ def __init__(self, api_item_yaml): 'func']) == 1 or not self.kernel['func'][1].endswith( '_sr') else True self.data_transform = self.parse_data_transform(api_item_yaml) - self.inplace_map, self.view_map = self.parse_inplace_and_view( - api_item_yaml) + self.inplace_map, self.view_map = {}, {} def get_api_name(self, api_item_yaml): return api_item_yaml['api'] @@ -303,31 +302,6 @@ def parse_data_transform(self, api_item_yaml): return data_transform - def parse_inplace_and_view(self, api_item_yaml): - inplace_map, view_map = {}, {} - for mode in ['inplace', 'view']: - if mode in api_item_yaml: - if mode == 'inplace': - inplace_map = {} - else: - view_map = {} - in_out_mapping_list = api_item_yaml[mode].split(',') - for item in in_out_mapping_list: - result = re.search(r"(?P<in>\w+)\s*->\s*(?P<out>\w+)", item) - in_val = result.group('in') - out_val = result.group('out') - assert in_val in self.inputs['names'], \ - f"{self.api} : {mode} input error: the input var name('{in_val}') is not found in the input args of {self.api}." - assert out_val in self.outputs['names'], \ - f"{self.api} : {mode} output error: the output var name('{out_val}') is not found in the output args of {self.api}." - - if mode == 'inplace': - inplace_map[out_val] = in_val - else: - view_map[out_val] = in_val - - return inplace_map, view_map - # Override by child class def get_return_type(self, inplace_flag=False): return None diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index c0923adf39c46..1721da19295d5 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -30,6 +30,8 @@ def __init__(self, api_item_yaml): super(ForwardAPI, self).__init__(api_item_yaml) self.is_dygraph_api, self.intermediate_outs = self.parse_intermediate( api_item_yaml) + self.inplace_map, self.view_map = self.parse_inplace_and_view( + api_item_yaml) def get_api_func_name(self): if self.is_dygraph_api: @@ -47,6 +49,31 @@ def parse_intermediate(self, api_item_yaml): else: return False, [] + def parse_inplace_and_view(self, api_item_yaml): + inplace_map, view_map = {}, {} + for mode in ['inplace', 'view']: + if mode in api_item_yaml: + if mode == 'inplace': + inplace_map = {} + else: + view_map = {} + in_out_mapping_list = api_item_yaml[mode].split(',') + for item in in_out_mapping_list: + result = re.search(r"(?P<in>\w+)\s*->\s*(?P<out>\w+)", item) + in_val = result.group('in') + out_val = result.group('out') + assert in_val in self.inputs['names'], \ + f"{self.api} : {mode} input error: the input var name('{in_val}') is not found in the input args of {self.api}." + assert out_val in self.outputs['names'], \ + f"{self.api} : {mode} output error: the output var name('{out_val}') is not found in the output args of {self.api}."
+ + if mode == 'inplace': + inplace_map[out_val] = in_val + else: + view_map[out_val] = in_val + + return inplace_map, view_map + def get_return_type_with_intermediate(self, inplace_flag=False): out_type_list = [] for i, out_type in enumerate(self.outputs['types']): diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 360425a30ccad..6a555fd24a066 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -32,6 +32,7 @@ param : [x] kernel : func : acos_grad + inplace : (out_grad -> x_grad) - backward_api : acosh_grad forward : acosh (Tensor x) -> Tensor(out) @@ -42,6 +43,7 @@ param : [x] kernel : func : acosh_grad + inplace : (out_grad -> x_grad) - backward_api : add_double_grad forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) @@ -54,6 +56,7 @@ func : add_double_grad optional : grad_x_grad, grad_y_grad backward : add_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : add_grad forward : add (Tensor x, Tensor y) -> Tensor(out) @@ -84,6 +87,7 @@ param : [grad_grad_x, grad_grad_y] kernel : func : add_triple_grad + inplace : (grad_grad_out_grad -> grad_grad_x_grad) - backward_api : addmm_grad forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out) @@ -115,6 +119,7 @@ param : [x] kernel : func : asin_grad + inplace : (out_grad -> x_grad) - backward_api : asinh_grad forward : asinh (Tensor x) -> Tensor(out) @@ -125,6 +130,7 @@ param : [x] kernel : func : asinh_grad + inplace : (out_grad -> x_grad) - backward_api : assign_grad forward : assign (Tensor x) -> Tensor(out) @@ -134,6 +140,7 @@ func : UnchangedInferMeta kernel : func : assign + inplace : (out_grad -> x_grad) - backward_api : assign_out__grad forward : assign_out_ (Tensor x, Tensor output) -> Tensor(out) @@ -143,6 +150,7 @@ func : UnchangedInferMeta kernel : func : assign + inplace : (out_grad -> x_grad) - backward_api : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) @@ -163,6 +171,7 @@ param : [x] kernel : func : atan_grad + inplace : (out_grad -> x_grad) - backward_api : atanh_grad forward : atanh (Tensor x) -> Tensor(out) @@ -173,6 +182,7 @@ param : [x] kernel : func : atanh_grad + inplace : (out_grad -> x_grad) - backward_api : batch_norm_double_grad forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) @@ -185,6 +195,7 @@ func : batch_norm_grad_grad data_type : x optional : out_mean, out_variance + inplace : (grad_out -> grad_out_grad) - backward_api : batch_norm_grad forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) @@ -208,6 +219,7 @@ param : [input] kernel : func : bce_loss_grad + inplace : (out_grad -> input_grad) - backward_api : brelu_grad forward : brelu (Tensor x, float t_min, float t_max) -> Tensor(out) @@ -218,6 +230,7 @@ param : [x] kernel : func : brelu_grad + inplace : (out_grad -> x_grad) - 
backward_api : cast_grad forward : cast (Tensor x, DataType out_dtype) -> Tensor(out) @@ -240,6 +253,7 @@ param: [out_grad] kernel : func : ceil_grad + inplace : (out_grad -> x_grad) - backward_api : celu_double_grad forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) @@ -250,6 +264,7 @@ param : [x, x] kernel : func : celu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : celu_grad forward : celu(Tensor x, float alpha) -> Tensor(out) @@ -261,6 +276,7 @@ kernel : func : celu_grad backward : celu_double_grad + inplace : (out_grad -> x_grad) - backward_api : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) @@ -302,6 +318,7 @@ kernel : func : clip_grad backward : clip_double_grad + inplace : (out_grad -> x_grad) - backward_api : concat_double_grad forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) @@ -375,6 +392,25 @@ use_gpudnn : true backward : conv2d_transpose_double_grad +- backward_api : conv3d_grad + forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad) + invoke : conv3d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) + backward : conv3d_grad_grad + +- backward_api : conv3d_grad_grad + forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv3d_grad_grad + use_gpudnn : true + optional : grad_input_grad, grad_filter_grad + - backward_api : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -394,6 +430,7 @@ param : [x] kernel : func : cos_grad + inplace : (out_grad -> x_grad) - backward_api : cosh_grad forward : cosh (Tensor x) -> Tensor(out) @@ -404,6 +441,7 @@ param : [x] kernel : func : cosh_grad + inplace : (out_grad -> x_grad) - backward_api : cross_entropy_with_softmax_grad forward : cross_entropy_with_softmax (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) -> Tensor(softmax), Tensor(loss) @@ -456,6 +494,25 @@ data_type : x 
optional : mask +- backward_api : depthwise_conv2d_grad + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + output : Tensor(input_grad), Tensor(filter_grad) + invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) + backward : depthwise_conv2d_grad_grad + +- backward_api : depthwise_conv2d_grad_grad + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv2d_grad_grad + use_gpudnn : true + optional : grad_input_grad, grad_filter_grad + - backward_api : depthwise_conv2d_transpose_grad forward : depthwise_conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -517,6 +574,7 @@ func : divide_double_grad data_type : out optional : grad_x_grad, grad_y_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : divide_grad forward : divide (Tensor x, Tensor y) -> Tensor(out) @@ -581,6 +639,7 @@ param : [x, x] kernel : func : elu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : elu_grad forward : elu (Tensor x, float alpha) -> Tensor(out) @@ -592,6 +651,13 @@ kernel : func : elu_grad backward : elu_double_grad + inplace : (out_grad -> x_grad) + +- backward_api : embedding_grad + forward : embedding (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) -> Tensor(out) + args : (Tensor x, Tensor weight, Tensor out_grad, int64_t padding_idx=-1, bool sparse=false) + output : Tensor(weight_grad) + invoke : embedding_grad_impl(x, weight, out_grad, padding_idx, sparse, weight_grad) - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) @@ -623,6 +689,7 @@ param : [out] kernel : func : exp_grad + inplace : (out_grad -> x_grad) - backward_api : expand_as_grad forward : expand_as (Tensor x, Tensor y, int[] target_shape) -> Tensor(out) @@ -665,6 +732,7 @@ param : [out] kernel : func : expm1_grad + inplace : (out_grad -> x_grad) - backward_api : flatten_grad forward : flatten(Tensor x, int start_axis, int stop_axis) -> Tensor(out), Tensor(xshape) @@ -699,6 +767,7 @@ 
param: [out_grad] kernel : func : floor_grad + inplace : (out_grad -> x_grad) - backward_api : fmax_grad forward : fmax(Tensor x, Tensor y, int axis) -> Tensor(out) @@ -794,6 +863,7 @@ param : [x] kernel : func : hard_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : hard_sigmoid_grad forward : hard_sigmoid (Tensor x, float slope, float offset) -> Tensor(out) @@ -804,6 +874,7 @@ param : [out] kernel : func : hard_sigmoid_grad + inplace : (out_grad -> x_grad) - backward_api : hard_swish_grad forward : hard_swish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out) @@ -814,6 +885,7 @@ param : [x] kernel : func : hard_swish_grad + inplace : (out_grad -> x_grad) - backward_api : huber_loss_grad forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) @@ -855,6 +927,29 @@ data_type : x no_need_buffer : x +- backward_api : instance_norm_double_grad + forward : instance_norm_grad(Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, float epsilon) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) + args : (Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float epsilon) + output : Tensor(x_grad), Tensor(fwd_scale_grad), Tensor(grad_y_grad) + infer_meta : + func : InstanceNormDoubleGradInferMeta + kernel : + func : instance_norm_double_grad + data_type : x + optional : fwd_scale, grad_x_grad, grad_scale_grad, grad_bias_grad + +- backward_api : instance_norm_grad + forward : instance_norm(Tensor x, Tensor scale, Tensor bias, float epsilon) -> Tensor(y), Tensor(saved_mean), Tensor(saved_variance) + args : (Tensor x, Tensor scale, Tensor saved_mean, Tensor saved_variance, Tensor y_grad, float epsilon) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) + infer_meta : + func : InstanceNormGradInferMeta + kernel : + func : instance_norm_grad + data_type : x + optional : scale + backward : instance_norm_double_grad + - backward_api : kldiv_loss_grad forward : kldiv_loss(Tensor x, Tensor label, str reduction) -> Tensor(out) args : (Tensor x, Tensor label, Tensor out_grad, str reduction) @@ -919,6 +1014,7 @@ param : [grad_x_grad] kernel : func : leaky_relu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : leaky_relu_grad forward : leaky_relu (Tensor x, float alpha) -> Tensor(out) @@ -930,6 +1026,7 @@ kernel : func : leaky_relu_grad backward : leaky_relu_double_grad + inplace : (out_grad -> x_grad) - backward_api : lerp_grad forward : lerp (Tensor x, Tensor y, Tensor weight) -> Tensor(out) @@ -960,6 +1057,7 @@ param : [x] kernel : func : log10_grad + inplace : (out_grad -> x_grad) - backward_api : log1p_grad forward : log1p (Tensor x) -> Tensor(out) @@ -970,6 +1068,7 @@ param : [x] kernel : func : log1p_grad + inplace : (out_grad -> x_grad) - backward_api : log2_grad forward : log2 (Tensor x) -> Tensor(out) @@ -980,6 +1079,7 @@ param : [x] kernel : func : log2_grad + inplace : (out_grad -> x_grad) - backward_api : log_double_grad forward : log_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) @@ -990,6 +1090,7 @@ param : [x, x] kernel : func : log_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : log_grad forward : log (Tensor x) -> Tensor(out) @@ -1001,6 +1102,7 @@ kernel : func : log_grad backward : log_double_grad + inplace : (out_grad -> x_grad) - backward_api : log_loss_grad forward : log_loss (Tensor input, Tensor label, 
float epsilon) -> Tensor(out) @@ -1041,6 +1143,7 @@ param : [x] kernel : func : logsigmoid_grad + inplace : (out_grad -> x_grad) - backward_api : logsumexp_grad forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) @@ -1222,6 +1325,7 @@ param : [x] kernel : func : mish_grad + inplace : (out_grad -> x_grad) - backward_api : mode_grad forward : mode(Tensor x, int axis, bool keepdim) -> Tensor(out), Tensor(indices) @@ -1275,6 +1379,7 @@ func : multiply_double_grad optional : grad_x_grad, grad_y_grad backward : multiply_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) @@ -1451,6 +1556,7 @@ param: [x] kernel : func : pow_grad + inplace : (out_grad -> x_grad) - backward_api : prelu_grad forward : prelu(Tensor x, Tensor alpha, str data_format, str mode) -> Tensor(out) @@ -1500,6 +1606,7 @@ param : [out] kernel : func : reciprocal_grad + inplace : (out_grad -> x_grad) - backward_api : reduce_prod_grad forward : reduce_prod (Tensor x, int64_t[] dims, bool keep_dim, bool reduce_all) -> Tensor(out) @@ -1520,6 +1627,7 @@ param : [out] kernel : func : relu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : relu_grad forward : relu (Tensor x) -> Tensor(out) @@ -1531,6 +1639,7 @@ kernel : func : relu_grad backward: relu_double_grad + inplace : (out_grad -> x_grad) - backward_api : reshape_double_grad forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) @@ -1542,6 +1651,7 @@ kernel : func : reshape_double_grad no_need_buffer : grad_out + inplace : (grad_x_grad -> grad_out_grad) - backward_api : reshape_grad forward : reshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) @@ -1605,6 +1715,7 @@ param: [out_grad] kernel : func : round_grad + inplace : (out_grad -> x_grad) - backward_api : rsqrt_double_grad forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) @@ -1615,6 +1726,7 @@ param : [out, out] kernel : func : rsqrt_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : rsqrt_grad forward : rsqrt (Tensor x) -> Tensor(out) @@ -1626,6 +1738,7 @@ kernel : func : rsqrt_grad backward : rsqrt_double_grad + inplace : (out_grad -> x_grad) - backward_api : scale_double_grad forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x) @@ -1701,6 +1814,7 @@ param : [x] kernel : func : sigmoid_cross_entropy_with_logits_grad + inplace : (out_grad -> x_grad) - backward_api : sigmoid_double_grad forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x) @@ -1712,6 +1826,7 @@ kernel : func : sigmoid_double_grad backward : sigmoid_triple_grad + inplace : (grad_x_grad -> fwd_grad_out_grad) - backward_api : sigmoid_grad forward : sigmoid (Tensor x) -> Tensor(out) @@ -1723,6 +1838,7 @@ kernel : func : sigmoid_grad backward : sigmoid_double_grad + inplace : (out_grad -> x_grad) - backward_api : sigmoid_triple_grad forward : sigmoid_double_grad (Tensor out, Tensor fwd_grad_out, Tensor grad_grad_x) -> Tensor(grad_out), Tensor(grad_grad_out) @@ -1734,6 +1850,7 @@ kernel : func : sigmoid_triple_grad optional : grad_grad_out_grad + inplace : (grad_grad_x -> fwd_grad_out_grad) - backward_api : silu_grad forward : silu (Tensor x) -> Tensor(out) @@ -1744,6 +1861,7 @@ param : [x] kernel : func : silu_grad + inplace : (out_grad -> x_grad) - backward_api : sin_grad forward : sin (Tensor x) -> Tensor(out) @@ -1754,6 +1872,7 @@ param : [x] kernel : func : sin_grad + 
inplace : (out_grad -> x_grad) - backward_api : sinh_grad forward : sinh (Tensor x) -> Tensor(out) @@ -1764,6 +1883,7 @@ param : [x] kernel : func : sinh_grad + inplace : (out_grad -> x_grad) - backward_api : slice_grad forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) @@ -1785,6 +1905,7 @@ param : [x] kernel : func : soft_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : softmax_grad forward : softmax (Tensor x, int axis) -> Tensor(out) @@ -1813,6 +1934,7 @@ param : [out, out] kernel : func : sqrt_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : sqrt_grad forward : sqrt (Tensor x) -> Tensor(out) @@ -1824,6 +1946,7 @@ kernel : func : sqrt_grad backward : sqrt_double_grad + inplace : (out_grad -> x_grad) - backward_api : square_double_grad forward : square_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) @@ -1834,6 +1957,7 @@ param : [x, x] kernel : func : square_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : square_grad forward : square (Tensor x) -> Tensor(out) @@ -1845,6 +1969,7 @@ kernel : func : square_grad backward : square_double_grad + inplace : (out_grad -> x_grad) - backward_api : squeeze_double_grad forward : squeeze_grad(Tensor xshape, Tensor grad_out, int[] axes) -> Tensor(grad_x) @@ -1898,6 +2023,7 @@ func : subtract_double_grad optional : grad_x_grad, grad_y_grad no_need_buffer : y, grad_out + inplace : (grad_x_grad -> grad_out_grad) - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) @@ -1946,6 +2072,7 @@ param : [x] kernel : func : swish_grad + inplace : (out_grad -> x_grad) - backward_api : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) @@ -1966,6 +2093,7 @@ param : [x] kernel : func : tan_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_double_grad forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) @@ -1977,6 +2105,7 @@ kernel : func : tanh_double_grad backward : tanh_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : tanh_grad forward : tanh (Tensor x) -> Tensor(out) @@ -1988,6 +2117,7 @@ kernel : func : tanh_grad backward : tanh_double_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) @@ -1998,6 +2128,7 @@ param : [x] kernel : func : tanh_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_triple_grad forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) @@ -2008,6 +2139,7 @@ param : [out, out, grad_x_grad_forward] kernel : func : tanh_triple_grad + inplace : (grad_x_grad_forward -> grad_out_forward_grad) - backward_api : thresholded_relu_grad forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out) @@ -2018,6 +2150,7 @@ param : [x] kernel : func : thresholded_relu_grad + inplace : (out_grad -> x_grad) - backward_api : tile_double_grad forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 90fba1c4130e5..7927e9faee370 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -380,6 +380,7 @@ def adjust_brightness(img, brightness_factor): Examples: .. 
code-block:: python + :name: code-example1 import numpy as np from PIL import Image @@ -388,9 +389,13 @@ def adjust_brightness(img, brightness_factor): fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') fake_img = Image.fromarray(fake_img) + print(fake_img.size) # (300, 256) + print(fake_img.load()[1,1]) # (95, 127, 202) + converted_img = F.adjust_brightness(fake_img, 0.5) + print(converted_img.size) # (300, 256) + print(converted_img.load()[1,1]) # (47, 63, 101) + - converted_img = F.adjust_brightness(fake_img, 0.4) - print(converted_img.size) """ if not (_is_pil_image(img) or _is_numpy_image(img) or _is_tensor_image(img)): diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index fea2efb1fb2b1..31f56e890558c 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -1042,14 +1042,32 @@ class RandomCrop(BaseTransform): size (sequence|int): Desired output size of the crop. If size is an int instead of sequence like (h, w), a square crop (size, size) is made. - padding (int|sequence|optional): Optional padding on each border + padding (int|sequence, optional): Optional padding on each border of the image. If a sequence of length 4 is provided, it is used to pad left, - top, right, bottom borders respectively. Default: 0. - pad_if_needed (boolean|optional): It will pad the image if smaller than the + top, right, bottom borders respectively. Default: None, without padding. + pad_if_needed (boolean, optional): It will pad the image if smaller than the desired size to avoid raising an exception. Default: False. + fill (float|tuple, optional): Pixel fill value for constant fill. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. Default: 0. + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value on the edge of the image + + - reflect: pads with reflection of image (without repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image (repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - Shape: + Shape - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A random cropped image. @@ -1059,17 +1077,17 @@ class RandomCrop(BaseTransform): Examples: .. 
code-block:: python + :name: code-example1 - import numpy as np - from PIL import Image + import paddle from paddle.vision.transforms import RandomCrop - transform = RandomCrop(224) - fake_img = Image.fromarray((np.random.rand(324, 300, 3) * 255.).astype(np.uint8)) + fake_img = paddle.randint(0, 255, shape=(3, 324,300), dtype = 'int32') + print(fake_img.shape) # [3, 324, 300] - fake_img = transform(fake_img) - print(fake_img.size) + crop_img = transform(fake_img) + print(crop_img.shape) # [3, 224, 224] """ def __init__(self, diff --git a/python/setup.py.in b/python/setup.py.in index 2a0d745729aab..ca1768c9462f0 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -374,6 +374,10 @@ packages=['paddle', 'paddle.incubate.distributed.models', 'paddle.incubate.distributed.models.moe', 'paddle.incubate.distributed.models.moe.gate', + 'paddle.incubate.sparse', + 'paddle.incubate.sparse.nn', + 'paddle.incubate.sparse.nn.layer', + 'paddle.incubate.sparse.nn.functional', 'paddle.io', 'paddle.optimizer', 'paddle.nn', @@ -394,9 +398,6 @@ packages=['paddle', 'paddle.device.cuda', 'paddle.version', 'paddle.profiler', - 'paddle.sparse', - 'paddle.sparse.layer', - 'paddle.sparse.functional', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook index 150a3f5666bd3..f0d2f0a22d79e 100755 --- a/tools/codestyle/pylint_pre_commit.hook +++ b/tools/codestyle/pylint_pre_commit.hook @@ -6,6 +6,13 @@ TOTAL_ERRORS=0 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export PYTHONPATH=$DIR:$PYTHONPATH +readonly VERSION="2.12.0" +version=$(pylint --version | grep 'pylint') + +if ! [[ $version == *"$VERSION"* ]]; then + pip install pylint==2.12.0 1>nul +fi + # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do pylint --disable=all --load-plugins=docstring_checker \ @@ -16,4 +23,3 @@ done exit $TOTAL_ERRORS #For now, just warning: #exit 0 - diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index ed13ca8762500..485bfd7968f05 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -20,7 +20,7 @@ function make_ubuntu_dockerfile(){ sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ - tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext zstd \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ tar -xvf git-2.17.1.tar.gz \&\& \ cd git-2.17.1 \&\& \ @@ -38,7 +38,7 @@ function make_ubuntu_dockerfile(){ ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ RUN dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ - RUN apt update \&\& apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} + RUN apt remove -y libnccl* 
--allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} } diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py new file mode 100644 index 0000000000000..09ba48f0d43fc --- /dev/null +++ b/tools/final_ut_parallel_rule.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import json +import datetime +import codecs +import sys + + +def classify_cases_by_mem(rootPath): + """classify cases by mem""" + case_filename = '%s/build/classify_case_by_cardNum.txt' % rootPath + case_exec_100 = [ + 'test_conv_eltwiseadd_bn_fuse_pass', 'test_trt_convert_pool2d', + 'test_fc_fuse_pass', 'test_trt_convert_depthwise_conv2d', + 'test_quant2_int8_resnet50_mkldnn', + 'test_conv_elementwise_add_act_fuse_pass', 'test_trt_convert_conv2d', + 'test_paddle_save_load', 'test_logical_op', 'test_nearest_interp_op', + 'test_pool2d_op', 'test_conv3d_transpose_op', 'test_lstmp_op', + 'test_cross_entropy2_op', 'test_sgd_op', 'test_imperative_ptq', + 'test_model', 'test_custom_relu_op_setup', 'test_dropout_op', + 'test_concat_op' + ] # barrel principle: cases taking 70s-100s + + case_exec_200 = [ + 'test_post_training_quantization_mnist', + 'test_imperative_auto_mixed_precision', + 'test_trt_dynamic_shape_ernie_fp16_ser_deser', + 'test_trt_dynamic_shape_ernie', 'test_layer_norm_op', + 'trt_quant_int8_yolov3_r50_test', 'test_gru_op', + 'test_post_training_quantization_while', 'test_mkldnn_log_softmax_op', + 'test_mkldnn_matmulv2_op', 'test_mkldnn_shape_op', + 'interceptor_pipeline_short_path_test', + 'interceptor_pipeline_long_path_test', 'test_cpuonly_spawn' + ] # barrel principle: cases taking 110s-200s, and cases that easily time out + + case_always_timeout = [ + 'test_quant2_int8_resnet50_channelwise_mkldnn', + 'test_parallel_dygraph_unused_variables_gloo', + 'test_seq2seq', + 'test_pool3d_op', + 'test_trilinear_interp_op', + 'test_trilinear_interp_v2_op', + 'test_dropout_op', + 'test_parallel_dygraph_sync_batch_norm', + 'test_conv3d_op', + 'test_quant2_int8_resnet50_range_mkldnn', + ] # always timeout + + f = open(case_filename) + lines = f.readlines() + all_tests_by_card = {} + for line in lines: + if line.startswith('single_card_tests:'): + all_tests_by_card['single_card_tests'] = [] + line = line.split('single_card_tests: ^job$|')[1].split('|') + for case in line: + case = case.replace('^', '').replace('$', '').strip() + all_tests_by_card['single_card_tests'].append(case) + elif line.startswith('multiple_card_tests:'): + all_tests_by_card['multiple_card_tests'] = [] + line = line.split('multiple_card_tests: ^job$|')[1].split('|') + for case in line: + case = case.replace('^', '').replace('$', '').strip() + all_tests_by_card['multiple_card_tests'].append(case) + elif line.startswith('exclusive_card_tests:'): + all_tests_by_card['exclusive_card_tests'] = [] + line = line.split('exclusive_card_tests:
^job$')[1].split('|') + for case in line: + case = case.replace('^', '').replace('$', '').strip() + all_tests_by_card['exclusive_card_tests'].append(case) + + with open("/pre_test/classify_case_by_cardNum.json", "w") as f: + json.dump(all_tests_by_card, f) + + with open("/pre_test/ut_mem_map.json", 'r') as load_f: + new_lastest_mem = json.load(load_f) + no_parallel_case = '^job$' + for cardType in all_tests_by_card: + case_mem_0 = '^job$' + case_mem_1 = {} + for case in all_tests_by_card[cardType]: + if case in case_exec_100 or case in case_exec_200: + continue + if case in case_always_timeout: + no_parallel_case = no_parallel_case + '|^' + case + '$' + if case not in new_lastest_mem: + continue + + #mem = 0 + if new_lastest_mem[case]["mem_nvidia"] == 0: + case_mem_0 = case_mem_0 + '|^' + case + '$' + #mem != 0 + else: + case_mem_1[case] = new_lastest_mem[case]["mem_nvidia"] + + with open('/pre_test/%s_mem0' % cardType, 'w') as f: + f.write(case_mem_0) + f.close() + + case_mem_1_sort = sorted(case_mem_1.items(), key=lambda x: x[1]) + case_mem_1_line = '^job$' + mem_1_sum = 0 + with open('/pre_test/%s' % cardType, 'w') as f_not_0: + for index in case_mem_1_sort: + if mem_1_sum < 16 * 1024 * 2: + mem_1_sum += index[1] + case_mem_1_line = case_mem_1_line + '|^' + index[0] + '$' + else: + f_not_0.write(case_mem_1_line + '\n') + ''' + if len(always_timeout_list + ) != 0 and cardType == 'single_card_tests' and count > 25: + f.write(case_mem_1_line + '|^%s$\n' % + always_timeout_list[0]) + always_timeout_list.pop(0) + else: + f.write(case_mem_1_line + '\n') + count += 1 + ''' + case_mem_1_line = '^job$|^' + index[0] + '$' + mem_1_sum = index[1] + f_not_0.write(case_mem_1_line + '\n') + + if cardType == 'single_card_tests': + for cases in [case_exec_100, case_exec_200]: + case_mem_1_line = '^job$' + for case in cases: + case_mem_1_line = case_mem_1_line + '|^' + case + '$' + f_not_0.write(case_mem_1_line + '\n') + f_not_0.close() + + os.system('cp %s/build/nightly_case /pre_test/' % rootPath) + + +if __name__ == '__main__': + rootPath = sys.argv[1] + classify_cases_by_mem(rootPath) diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py index 745d7f9a90c24..37d167693c7a6 100644 --- a/tools/get_ut_mem_map.py +++ b/tools/get_ut_mem_map.py @@ -14,6 +14,7 @@ import os import json +import sys def get_ut_mem(rootPath): @@ -24,7 +25,7 @@ def get_ut_mem(rootPath): continue ut = f.replace('^', '').replace('$.log', '') case_dic[ut] = {} - filename = '%s%s' % (parent, f) + filename = '%s/%s' % (parent, f) fi = open(filename) lines = fi.readlines() mem_reserved1 = -1 @@ -56,7 +57,7 @@ def get_ut_mem(rootPath): if caseTime != -1: case_dic[ut]['time'] = caseTime - ut_mem_map_file = "/pre_test/ut_mem_map.json" % rootPath + ut_mem_map_file = "/pre_test/ut_mem_map.json" with open(ut_mem_map_file, "w") as f: json.dump(case_dic, f) diff --git a/tools/group_case_for_parallel.py b/tools/group_case_for_parallel.py new file mode 100644 index 0000000000000..b8aab3a3fe6e6 --- /dev/null +++ b/tools/group_case_for_parallel.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + + +def group_case_for_parallel(rootPath): + """group cases""" + + #wget file + for filename in [ + 'nightly_tests', 'single_card_tests', 'single_card_tests_mem0', + 'multiple_card_tests', 'multiple_card_tests_mem0', + 'exclusive_card_tests', 'exclusive_card_tests_mem0' + ]: + os.system( + 'cd %s/tools && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/%s --no-check-certificate' + % (rootPath, filename)) + + #get nightly tests + nightly_tests_file = open('%s/tools/nightly_tests' % rootPath, 'r') + nightly_tests = nightly_tests_file.read().strip().split('\n') + nightly_tests_file.close() + + parallel_case_file_list = [ + '%s/tools/single_card_tests_mem0' % rootPath, + '%s/tools/single_card_tests' % rootPath, + '%s/tools/multiple_card_tests_mem0' % rootPath, + '%s/tools/multiple_card_tests' % rootPath, + '%s/tools/exclusive_card_tests_mem0' % rootPath, + '%s/tools/exclusive_card_tests' % rootPath + ] + case_file = '%s/build/ut_list' % rootPath + if os.path.exists(case_file): + f = open(case_file, 'r') + all_need_run_cases = f.read().strip().split('\n') + if len(all_need_run_cases) == 1 and all_need_run_cases[0] == '': + f.close() + case_file = '%s/build/all_ut_list' % rootPath + f = open(case_file, 'r') + all_need_run_cases = f.read().strip().split('\n') + else: + case_file = '%s/build/all_ut_list' % rootPath + f = open(case_file, 'r') + all_need_run_cases = f.read().strip().split('\n') + + print("case_file: %s" % case_file) + + all_group_case = [] + for filename in parallel_case_file_list: + fi = open(filename, 'r') + new_f = open('%s_new' % filename, 'w') + lines = fi.readlines() + new_case_file_list = [] + for line in lines: + case_line_list = line.replace('^', '').replace('|', '').split('$') + new_case_line_list = list( + set(all_need_run_cases).intersection(set(case_line_list))) + if len(new_case_line_list) != 0: + new_case_file_list.append(new_case_line_list) + all_group_case += new_case_line_list + all_need_run_cases = list( + set(all_need_run_cases).difference(set(all_group_case))) + + for line in new_case_file_list: + cases = '$|^'.join(case for case in line) + cases = '^job$|^%s$' % cases + new_f.write(cases + '\n') + fi.close() + new_f.close() + + #no parallel cases + cases = '^job' + if len(all_need_run_cases) != 0: + for case in all_need_run_cases: + if case not in nightly_tests: + cases = cases + '$|^%s' % case + cases = '%s$' % cases + + new_f = open('%s/tools/no_parallel_case_file' % rootPath, 'w') + new_f.write(cases + '\n') + new_f.close() + f.close() + + +if __name__ == "__main__": + rootPath = sys.argv[1] + group_case_for_parallel(rootPath) diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json index 2502e248c5c48..75533311513e5 100644 --- a/tools/infrt/skipped_phi_api.json +++ b/tools/infrt/skipped_phi_api.json @@ -1,4 +1,4 @@ { -"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm"], +"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", 
"label_smooth", "layer_norm", "instance_norm"], "phi_kernels":["equal_all"] } diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 5088ad3457fb9..7c43ef1a6d2e3 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -824,7 +824,7 @@ 'test_mean_op', 'test_is_tensor', 'test_run_program_op', 'test_cuda_random_seed', 'test_linear_interp_op', 'test_fuse_all_reduce_pass', 'tensor_util_test', 'test_median', - 'test_linear', 'test_imperative_qat_amp', + 'test_nanmedian', 'test_linear', 'test_imperative_qat_amp', 'test_truncated_gaussian_random_op', 'test_lstm_cudnn_op', 'copy_same_tensor_test', 'test_squeeze2_op', 'naive_best_fit_allocator_test', 'test_model', 'test_py_reader_combination', @@ -2047,6 +2047,8 @@ 'test_lambda', 'test_prod_op', 'test_fused_attention_op_api', + 'test_fused_bias_dropout_residual_layer_norm_op', + 'test_fused_bias_dropout_residual_layer_norm_op_api', 'test_complex_grad_accumulated', 'test_deg2rad', 'test_lgamma_op', diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 13005350d7bd5..1bd9f029d552c 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -342,7 +342,7 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): logger.error( "Error: No sample code found! Please check if the API comment contais string 'Examples:' correctly" ) - exit(1) + return [] sample_code_filenames = [] for y, cb in enumerate(codeblocks): diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 6067b40f0a7c1..95c5ecf713112 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -399,6 +399,7 @@ 'test_positive_negative_pair_op', 'test_precision_recall_op', 'test_prelu_op', + 'test_rrelu_op', 'test_prelu_mkldnn_op', 'test_print_op', 'test_prior_box_op',