diff --git a/.gitignore b/.gitignore index 9ced08c2fa095..875de7af1beea 100644 --- a/.gitignore +++ b/.gitignore @@ -66,14 +66,14 @@ paddle/infrt/dialect/pd/common/pd_ops_info.h paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc -paddle/fluid/pybind/eager_final_state_op_function.cc +paddle/fluid/pybind/eager_op_function.cc # these files (directories) are generated before build system generation paddle/fluid/operators/generated_op.cc paddle/phi/ops/compat/generated_sig.cc paddle/phi/api/yaml/parsed_apis/ python/paddle/utils/code_gen/ -paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h -paddle/fluid/pybind/eager_final_state_op_function_impl.h +paddle/fluid/pybind/tmp_eager_op_function_impl.h +paddle/fluid/pybind/eager_op_function_impl.h paddle/fluid/pybind/eager_op_function_impl.h paddle/fluid/pybind/op_function_impl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 762fd17909b6e..49ccb815c995d 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -364,6 +364,18 @@ if(WIN32) endif() endif() +if(NOT WITH_TESTING AND WITH_MULTINODE_TESTING) + message( + WARNING + "Disable WITH_MULTINODE_TESTING when compiling without TESTING. Force WITH_MULTINODE_TESTING=OFF." + ) + set(WITH_MULTINODE_TESTING + OFF + CACHE STRING + "Disable WITH_MULTINODE_TESTING when compiling without TESTING" + FORCE) +endif() + if(NOT WITH_GPU AND WITH_NCCL) message( WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 02e57700c9879..9e2ceb29f2a64 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -23,9 +23,10 @@ if(WITH_NCCL OR WITH_RCCL) if(WITH_DISTRIBUTE AND WITH_PSCORE) if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + set_source_files_properties( + ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) endif() - set_source_files_properties( - ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library( processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc @@ -47,9 +48,11 @@ if(WITH_ASCEND_CL) if(WITH_DISTRIBUTE AND WITH_PSCORE) if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + set_source_files_properties( + ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) endif() - set_source_files_properties( - ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library( processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 73d8539329a75..e5cfe838c54f3 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -41,7 +41,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) - add_dependencies(grad_tensor_holder eager_final_state_codegen) + add_dependencies(grad_tensor_holder eager_codegen) cc_library( backward SRCS backward.cc diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index 1f2b30853c6bf..69bfe4d941572 100644 --- 
a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -8,5 +8,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) final_dygraph_node SRCS nodes.cc ${eager_manual_nodes} DEPS ${eager_deps}) - add_dependencies(final_dygraph_node eager_final_state_codegen) + add_dependencies(final_dygraph_node eager_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 9baf8956fe2e4..c32dd2f122a6e 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -8,5 +8,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) final_dygraph_function SRCS dygraph_functions.cc ${eager_manual_functions} DEPS ${eager_deps}) - add_dependencies(final_dygraph_function eager_final_state_codegen) + add_dependencies(final_dygraph_function eager_codegen) endif() diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index f9d10600a9a00..49d401b92303e 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -16,10 +16,10 @@ #include "paddle/phi/api/include/tensor.h" -paddle::experimental::Tensor add_n_final_state_dygraph_function( +paddle::experimental::Tensor add_n_dygraph_function( const std::vector<paddle::experimental::Tensor>& x); -paddle::experimental::Tensor conv2d_final_state_dygraph_function( +paddle::experimental::Tensor conv2d_dygraph_function( const paddle::experimental::Tensor& input, const paddle::experimental::Tensor& filter, std::vector<int> strides, diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index 226197b0f84ad..3081eaf3584f6 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -23,7 +23,7 @@ #pragma GCC diagnostic ignored "-Wunused-variable" DECLARE_bool(check_nan_inf); -paddle::experimental::Tensor add_n_final_state_dygraph_function( +paddle::experimental::Tensor add_n_dygraph_function( const std::vector<paddle::experimental::Tensor>& x) { // Dygraph Record Event paddle::platform::RecordEvent dygraph_entrance_record_event( @@ -46,7 +46,7 @@ paddle::experimental::Tensor add_n_final_state_dygraph_function( paddle::imperative::AutoCastGuard guard( egr::Controller::Instance().GetCurrentTracer(), paddle::imperative::AmpLevel::O0); - return add_n_final_state_dygraph_function(NEW_x); + return add_n_dygraph_function(NEW_x); } } @@ -56,7 +56,7 @@ paddle::experimental::Tensor add_n_final_state_dygraph_function( std::vector<egr::AutogradMeta*>* x_autograd_meta = &x_autograd_meta_vec; // Forward API Call VLOG(3) << "Final State Running: " - << "add_n_final_state_dygraph_function"; + << "add_n_dygraph_function"; auto api_result = paddle::experimental::add_n(x); // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index f7bff6fb88997..ee1bfb17b3e85 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -23,7
+23,7 @@ #pragma GCC diagnostic ignored "-Wunused-variable" DECLARE_bool(check_nan_inf); -paddle::experimental::Tensor conv2d_final_state_dygraph_function( +paddle::experimental::Tensor conv2d_dygraph_function( const paddle::experimental::Tensor& input, const paddle::experimental::Tensor& filter, std::vector<int> strides, @@ -59,17 +59,17 @@ paddle::experimental::Tensor conv2d_final_state_dygraph_function( paddle::imperative::AutoCastGuard guard( egr::Controller::Instance().GetCurrentTracer(), paddle::imperative::AmpLevel::O0); - return conv2d_final_state_dygraph_function(NEW_input, - NEW_filter, - strides, - paddings, - paddding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + return conv2d_dygraph_function(NEW_input, + NEW_filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search); } } @@ -80,7 +80,7 @@ paddle::experimental::Tensor conv2d_final_state_dygraph_function( egr::EagerUtils::nullable_autograd_meta(filter); // Forward API Call VLOG(3) << "Final State Running: " - << "conv2d_final_state_dygraph_function"; + << "conv2d_dygraph_function"; auto api_result = paddle::experimental::conv2d(input, filter, strides, diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc index e314c0c2b5b4e..b0dc4f59ffda5 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc @@ -64,8 +64,8 @@ AddNGradNodeFinal::operator()( // dygraph function for (size_t i = 0; i < returns[0].size(); i++) { - returns[0][i] = ::scale_final_state_dygraph_function( - out_grad, phi::Scalar(1.0), 0.0, true); + returns[0][i] = + ::scale_dygraph_function(out_grad, phi::Scalar(1.0), 0.0, true); } // Check NaN and Inf if needed diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 162801c716962..3c1f6835c302a 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -1,4 +1,4 @@ -add_subdirectory(final_state_generator) +add_subdirectory(generator) set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} @@ -88,7 +88,7 @@ if(WIN32) endif() add_custom_target( - eager_codegen + legacy_eager_codegen COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" @@ -97,7 +97,7 @@ if(WIN32) VERBATIM) else() add_custom_target( - eager_codegen + legacy_eager_codegen COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind" diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 519e0a1ed567a..d6b04f92dfc8d 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -37,11 +37,11 @@ namespace framework { // To handle append_op at python-level std::unordered_map<std::string, std::vector<std::string>> - core_ops_returns_info = {}; -std::unordered_map<std::string, std::vector<std::string>> core_ops_args_info = - {}; + core_ops_legacy_returns_info = {}; std::unordered_map<std::string, std::vector<std::string>> - core_ops_args_type_info = {}; + core_ops_legacy_args_info = {}; +std::unordered_map<std::string, std::vector<std::string>> + core_ops_legacy_args_type_info = {}; /* --- Static maps to handle corner cases --- */ static std::unordered_map @@ -1473,10 +1473,10 @@ static std::pair<std::string, std::string>
GenerateForwardFunctionContents( std::string dygraph_function_args_str = ""; std::string amp_function_call_args_str = ""; - core_ops_args_info[op_type] = {}; - core_ops_args_type_info[op_type] = {}; - core_ops_args_info[op_type].resize(in_vars.size()); - core_ops_args_type_info[op_type].resize(in_vars.size()); + core_ops_legacy_args_info[op_type] = {}; + core_ops_legacy_args_type_info[op_type] = {}; + core_ops_legacy_args_info[op_type].resize(in_vars.size()); + core_ops_legacy_args_type_info[op_type].resize(in_vars.size()); /* ------ Dygraph forward function generation ------ */ generated_function_body += " // Dygraph Forward Pass\n"; @@ -1500,7 +1500,7 @@ static std::pair GenerateForwardFunctionContents( amp_function_call_args_str_list[input_position] = " NEW_" + LegalizeVarName(input_name); - core_ops_args_type_info[op_type][input_position] = "list"; + core_ops_legacy_args_type_info[op_type][input_position] = "list"; } else { // inplace tensor can't be const const char* FWD_INS_ARG_TEMPLATE; @@ -1522,9 +1522,9 @@ static std::pair GenerateForwardFunctionContents( amp_function_call_args_str_list[input_position] = " NEW_" + LegalizeVarName(input_name); - core_ops_args_type_info[op_type][input_position] = "tensor"; + core_ops_legacy_args_type_info[op_type][input_position] = "tensor"; } - core_ops_args_info[op_type][input_position] = input_name; + core_ops_legacy_args_info[op_type][input_position] = input_name; if (input.dispensable()) continue; @@ -1666,7 +1666,7 @@ static std::pair GenerateForwardFunctionContents( dygraph_function_args_str += arg_str; amp_function_call_args_str += (", " + LegalizeVarName(output_var_name)); - core_ops_args_type_info[op_type].push_back("list"); + core_ops_legacy_args_type_info[op_type].push_back("list"); } else { const char* FWD_NUM_ARG_TEMPLATE = ", paddle::experimental::Tensor* %s"; std::string arg_str = paddle::string::Sprintf( @@ -1674,7 +1674,7 @@ static std::pair GenerateForwardFunctionContents( dygraph_function_args_str += arg_str; amp_function_call_args_str += (", " + LegalizeVarName(output_var_name)); - core_ops_args_type_info[op_type].push_back("tensor"); + core_ops_legacy_args_type_info[op_type].push_back("tensor"); } if (BeSameAsInput(output_name, input_names)) { @@ -1693,7 +1693,7 @@ static std::pair GenerateForwardFunctionContents( output_name, LegalizeVarName(output_var_name)); } - core_ops_args_info[op_type].push_back(output_name); + core_ops_legacy_args_info[op_type].push_back(output_name); } else if (!forward_inplace_map.empty() && forward_inplace_map.count(output_name)) { @@ -1727,8 +1727,8 @@ static std::pair GenerateForwardFunctionContents( "{ \"%s\", egr::EagerUtils::CreateVars(%s) },"; outs_contents_str += paddle::string::Sprintf( FWD_OUTS_CONTENT_TEMPLATE, output_name, outnum); - core_ops_args_info[op_type].push_back(outnum); - core_ops_args_type_info[op_type].push_back("int"); + core_ops_legacy_args_info[op_type].push_back(outnum); + core_ops_legacy_args_type_info[op_type].push_back("int"); } else { const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", " @@ -2003,10 +2003,11 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; /* ------ END Generate TraceOp ----- */ - // [Generation] Handle core_ops_returns_info - // avoid inplace op changing core_ops_returns_info - if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) { - core_ops_returns_info[op_type] = return_contents; + // [Generation] Handle core_ops_legacy_returns_info + // avoid inplace op changing 
core_ops_legacy_returns_info + if (core_ops_legacy_returns_info.empty() || + !core_ops_legacy_returns_info.count(op_type)) { + core_ops_legacy_returns_info[op_type] = return_contents; } // [Generation] ComputeRequireGrad -> GradNodeCreation @@ -2983,13 +2984,13 @@ static std::string GenerateDygraphHFileIncludes() { dygraph_forward_api_includes_str += "extern std::unordered_map> " - "core_ops_args_info;\n"; + "core_ops_legacy_args_info;\n"; dygraph_forward_api_includes_str += "extern std::unordered_map> " - "core_ops_args_type_info;\n"; + "core_ops_legacy_args_type_info;\n"; dygraph_forward_api_includes_str += "extern std::unordered_map> " - "core_ops_returns_info;\n\n"; + "core_ops_legacy_returns_info;\n\n"; return dygraph_forward_api_includes_str; } @@ -3060,7 +3061,7 @@ static void GenerateNodeCCFile(const std::string& node_cc_path, static std::string ConvertCoreOpsInfosToString( const std::unordered_map>& core_ops_info) { - std::string core_ops_returns_info_init_str = ""; + std::string core_ops_legacy_returns_info_init_str = ""; for (const auto& iter : core_ops_info) { const char* Core_Ops_Returns_TEMPLATE = "{ \"%s\", { %s } },\n"; const std::string& op_type = iter.first; @@ -3074,23 +3075,23 @@ static std::string ConvertCoreOpsInfosToString( if (returns_str.size() > 0) returns_str.pop_back(); std::string op_type_init_str = paddle::string::Sprintf( Core_Ops_Returns_TEMPLATE, op_type, returns_str); - core_ops_returns_info_init_str += op_type_init_str; + core_ops_legacy_returns_info_init_str += op_type_init_str; } // Remove trailing ',' - if (core_ops_returns_info_init_str.size() > 0) - core_ops_returns_info_init_str.pop_back(); + if (core_ops_legacy_returns_info_init_str.size() > 0) + core_ops_legacy_returns_info_init_str.pop_back(); - return core_ops_returns_info_init_str; + return core_ops_legacy_returns_info_init_str; } static std::string GenerateCoreOpsArgsInfo() { const char* Core_Ops_Returns_MAP_TEMPLATE = "std::unordered_map> " - "core_ops_args_info = { %s };\n"; + "core_ops_legacy_args_info = { %s };\n"; std::string core_ops_args_info_init_str = - ConvertCoreOpsInfosToString(core_ops_args_info); + ConvertCoreOpsInfosToString(core_ops_legacy_args_info); std::string core_ops_info_str = paddle::string::Sprintf( Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str); @@ -3101,10 +3102,10 @@ static std::string GenerateCoreOpsArgsInfo() { static std::string GenerateCoreOpsArgsTypeInfo() { const char* Core_Ops_Returns_MAP_TEMPLATE = "std::unordered_map> " - "core_ops_args_type_info = { %s };\n"; + "core_ops_legacy_args_type_info = { %s };\n"; std::string core_ops_args_type_info_init_str = - ConvertCoreOpsInfosToString(core_ops_args_type_info); + ConvertCoreOpsInfosToString(core_ops_legacy_args_type_info); std::string core_ops_info_str = paddle::string::Sprintf( Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_type_info_init_str); @@ -3115,13 +3116,13 @@ static std::string GenerateCoreOpsArgsTypeInfo() { static std::string GenerateCoreOpsReturnsInfo() { const char* Core_Ops_Returns_MAP_TEMPLATE = "std::unordered_map> " - "core_ops_returns_info = { %s };\n"; + "core_ops_legacy_returns_info = { %s };\n"; - std::string core_ops_returns_info_init_str = - ConvertCoreOpsInfosToString(core_ops_returns_info); + std::string core_ops_legacy_returns_info_init_str = + ConvertCoreOpsInfosToString(core_ops_legacy_returns_info); std::string core_ops_info_str = paddle::string::Sprintf( - Core_Ops_Returns_MAP_TEMPLATE, core_ops_returns_info_init_str); + Core_Ops_Returns_MAP_TEMPLATE, 
core_ops_legacy_returns_info_init_str); return core_ops_info_str; } diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index 9fbf1ed6cd4a1..431bbdea7e071 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -124,7 +124,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): ".tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes" + str(i + 1) + ".cc\"\n") - f.write(" DEPENDS eager_codegen\n") + f.write(" DEPENDS legacy_eager_codegen\n") f.write(" VERBATIM)\n") f.write("cc_library(dygraph_node SRCS ") @@ -154,7 +154,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): f.write( " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.cc\"\n" ) - f.write(" DEPENDS eager_codegen\n") + f.write(" DEPENDS legacy_eager_codegen\n") f.write(" VERBATIM)\n") f.write("cc_library(dygraph_function SRCS ") diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt similarity index 89% rename from paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt rename to paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt index 6a4c577f5e5f3..aeceb50573e9b 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt @@ -34,10 +34,10 @@ set(fwd_api_yaml_path message("Final State Eager CodeGen") add_custom_target( - eager_final_state_codegen + eager_codegen COMMAND "${PYTHON_EXECUTABLE}" - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py" "--api_yaml_path=${api_yaml_path},${fwd_api_yaml_path}" "--backward_yaml_path=${backward_yaml_path}" "--forwards_cc_path=${tmp_forwards_cc_path}" @@ -54,16 +54,15 @@ add_custom_target( VERBATIM) set(tmp_python_c_output_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function.cc.tmp" -) + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc.tmp") set(python_c_output_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function.cc") + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc") add_custom_target( - eager_final_state_python_c_codegen + eager_python_c_codegen COMMAND "${PYTHON_EXECUTABLE}" - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py" "--api_yaml_path=${api_yaml_path},${fwd_api_yaml_path}" "--output_path=${tmp_python_c_output_path}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_python_c_output_path} diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py similarity index 99% rename from paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py rename to 
paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py index 56095963d4938..1dbea53b3797a 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py @@ -165,7 +165,7 @@ def str2Hump(text): def GetDygraphForwardFunctionName(string): - return f"{string}_final_state_dygraph_function" + return f"{string}_dygraph_function" def GetIntermediateAPIFunctionName(string): @@ -198,7 +198,7 @@ def GetInplacedFunctionName(function_name): def GetForwardFunctionName(string): - return f"{string}_final_state_dygraph_function" + return f"{string}_dygraph_function" def GetIndent(num): diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py similarity index 98% rename from paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py rename to paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 3e4efba3f404b..0688cd5179af3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -349,13 +349,13 @@ class {} : public egr::GradNodeBase {{ CORE_OPS_INFO_TEMPLATE = \ """ -std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info = {{ +std::unordered_map<std::string, std::vector<std::string>> core_ops_args_info = {{ {} }}; -std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info = {{ +std::unordered_map<std::string, std::vector<std::string>> core_ops_args_type_info = {{ {} }}; -std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info = {{ +std::unordered_map<std::string, std::vector<std::string>> core_ops_returns_info = {{ {} }}; @@ -363,9 +363,9 @@ class {} : public egr::GradNodeBase {{ CORE_OPS_DECLARATION_TEMPLATE = \ """ -extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info; -extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info; -extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info; +extern std::unordered_map<std::string, std::vector<std::string>> core_ops_args_info; +extern std::unordered_map<std::string, std::vector<std::string>> core_ops_args_type_info; +extern std::unordered_map<std::string, std::vector<std::string>> core_ops_returns_info; """ @@ -1260,31 +1260,24 @@ def UpdateCoreOpsInformation(self, is_inplaced): forward_inputs_position_map.keys()) + len(forward_attrs_list) num_returns = len(forward_outputs_position_map.keys()) - final_state_fwd_api_name = "final_state_" + forward_api_name - core_ops_returns_info[final_state_fwd_api_name] = [ - "" for i in range(num_returns) - ] - core_ops_args_info[final_state_fwd_api_name] = [ - "" for i in range(num_args) - ] - core_ops_args_type_info[final_state_fwd_api_name] = [ - "" for i in range(num_args) - ] + fwd_api_name = "" + forward_api_name + core_ops_returns_info[fwd_api_name] = ["" for i in range(num_returns)] + core_ops_args_info[fwd_api_name] = ["" for i in range(num_args)] + core_ops_args_type_info[fwd_api_name] = ["" for i in range(num_args)] for name, (ttype, pos) in forward_inputs_position_map.items(): - core_ops_args_info[final_state_fwd_api_name][pos] = name + core_ops_args_info[fwd_api_name][pos] = name if IsPlainTensorType(ttype): - core_ops_args_type_info[final_state_fwd_api_name][ pos] = "tensor" + core_ops_args_type_info[fwd_api_name][pos] = "tensor" else: assert IsVectorTensorType(ttype) - core_ops_args_type_info[final_state_fwd_api_name][pos] = "list" + core_ops_args_type_info[fwd_api_name][pos] = "list" for name, _, _, pos in forward_attrs_list: - core_ops_args_info[final_state_fwd_api_name][pos] = name + core_ops_args_info[fwd_api_name][pos] = name for name, (ttype, pos) in forward_outputs_position_map.items(): -
core_ops_returns_info[final_state_fwd_api_name][pos] = name + core_ops_returns_info[fwd_api_name][pos] = name def run(self): super().run() diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py similarity index 93% rename from paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py rename to paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 4d5f5c9d61e80..a763b83bb9ec4 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -36,6 +36,7 @@ def SkipAPIGeneration(forward_api_name): "long": "CastPyArg2Long", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", + "double": "CastPyArg2Double", "std::string": "CastPyArg2String", "std::vector<bool>": "CastPyArg2Booleans", "std::vector<int>": "CastPyArg2Ints", @@ -84,7 +85,7 @@ def FindParsingFunctionFromAttributeType(atype): PYTHON_C_FUNCTION_TEMPLATE = \ """ -static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ +static PyObject * eager_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ {} PyThreadState *tstate = nullptr; @@ -145,7 +146,7 @@ def FindParsingFunctionFromAttributeType(atype): PYTHON_C_FUNCTION_REG_TEMPLATE = \ """ -{{\"final_state_{}{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}} +{{\"{}{}\", (PyCFunction)(void(*)(void)) {}eager_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}} """ @@ -161,7 +162,7 @@ def FindParsingFunctionFromAttributeType(atype): #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/eager_final_state_custom_python_api.h" +#include "paddle/fluid/pybind/eager_custom_python_api.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/eager/amp_utils.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" @@ -192,11 +193,11 @@ def FindParsingFunctionFromAttributeType(atype): CORE_OPS_INFO = \ """ -static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) { +static PyObject * eager_get_core_ops_args_info(PyObject *self) { PyThreadState *tstate = nullptr; try { - return ToPyObject(core_ops_final_state_args_info); + return ToPyObject(core_ops_args_info); } catch(...) { if (tstate) { @@ -207,11 +208,11 @@ def FindParsingFunctionFromAttributeType(atype): } } -static PyObject * eager_get_final_state_core_ops_args_type_info(PyObject *self) { +static PyObject * eager_get_core_ops_args_type_info(PyObject *self) { PyThreadState *tstate = nullptr; try { - return ToPyObject(core_ops_final_state_args_type_info); + return ToPyObject(core_ops_args_type_info); } catch(...) { if (tstate) { @@ -222,11 +223,11 @@ def FindParsingFunctionFromAttributeType(atype): } } -static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { +static PyObject * eager_get_core_ops_returns_info(PyObject *self) { PyThreadState *tstate = nullptr; try { - return ToPyObject(core_ops_final_state_returns_info); + return ToPyObject(core_ops_returns_info); } catch(...)
{ if (tstate) { @@ -241,16 +242,16 @@ def FindParsingFunctionFromAttributeType(atype): CORE_OPS_INFO_REGISTRY = \ """ - {\"get_final_state_core_ops_args_info\", - (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, - \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, - {\"get_final_state_core_ops_args_type_info\", - (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_type_info, + {\"get_core_ops_args_info\", + (PyCFunction)(void(*)(void))eager_get_core_ops_args_info, METH_NOARGS, + \"C++ interface function for eager_get_core_ops_args_info.\"}, + {\"get_core_ops_args_type_info\", + (PyCFunction)(void(*)(void))eager_get_core_ops_args_type_info, METH_NOARGS, - \"C++ interface function for eager_get_final_state_core_ops_args_type_info.\"}, - {\"get_final_state_core_ops_returns_info\", - (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_returns_info, - METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"}, + \"C++ interface function for eager_get_core_ops_args_type_info.\"}, + {\"get_core_ops_returns_info\", + (PyCFunction)(void(*)(void))eager_get_core_ops_returns_info, + METH_NOARGS, \"C++ interface function for eager_get_core_ops_returns_info.\"}, """ NAMESPACE_WRAPPER_TEMPLATE = \ diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index d1813ae3de1dd..4ebc2860c59d9 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -45,7 +45,7 @@ inline paddle::experimental::Tensor Cast( const bool trace_backward = true) { if (input.is_sparse_coo_tensor() || input.is_sparse_csr_tensor()) { if (trace_backward) { - return sparse::cast_final_state_dygraph_function( + return sparse::cast_dygraph_function( input, paddle::experimental::DataType::UNDEFINED, dst_dtype); } else { return paddle::experimental::sparse::cast( @@ -53,7 +53,7 @@ inline paddle::experimental::Tensor Cast( } } else { if (trace_backward) { - return cast_final_state_dygraph_function(input, dst_dtype); + return cast_dygraph_function(input, dst_dtype); } else { return paddle::experimental::cast(input, dst_dtype); } diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 231d81b5e73a6..afd9e4ef865ff 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -143,7 +143,7 @@ void GradTensorHolder::add(size_t slot_id, if (t.is_dense_tensor()) { if (buffer_tensor.is_dense_tensor()) { if (create_graph || t.is_custom_device()) { - buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor); + buffer_tensor = add_dygraph_function(t, buffer_tensor); } else { paddle::imperative::TensorAdd( t, &buffer_tensor); @@ -170,8 +170,7 @@ void GradTensorHolder::add(size_t slot_id, std::make_shared( buffer_sparse->non_zero_elements())); if (create_graph || t.is_custom_device()) { - buffer_values = - add_final_state_dygraph_function(t_values, buffer_values); + buffer_values = add_dygraph_function(t_values, buffer_values); } else { paddle::imperative::TensorAdd( t_values, &buffer_values); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index b41938d4856dd..144ceab1e4983 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -77,8 +77,7 @@ void benchmark_eager_matmul(const 
paddle::experimental::Tensor& X, size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; for (size_t i = 0; i < max_num_runs; i++) { - input_tensor0 = - matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + input_tensor0 = matmul_dygraph_function(input_tensor0, Y, false, false); } std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0}; diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index b4a939f822b82..a82e8e7e76831 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -180,6 +180,37 @@ struct ExtractAttribute { const std::string& attr_name_; }; +template <> +struct ExtractAttribute<double> { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + double* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = PADDLE_GET_CONST(int, attr); + attr = static_cast<double>(val); + } else if (attr.type() == typeid(int64_t)) { // NOLINT + int64_t val = PADDLE_GET_CONST(int64_t, attr); + attr = static_cast<double>(val); + } else if (attr.type() == typeid(float)) { // NOLINT + float val = PADDLE_GET_CONST(float, attr); + attr = static_cast<double>(val); + } + double* attr_value = nullptr; + try { + attr_value = &paddle::get<double>(attr); + } catch (paddle::bad_variant_access const& bad_get) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get attribute (%s) by type double, its type is %s.", + attr_name_, + paddle::platform::demangle(attr.type().name()))); + } + return attr_value; + } + + const std::string& attr_name_; +}; + template <> struct ExtractAttribute<std::vector<double>> { explicit ExtractAttribute(const std::string& attr_name) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 61a495a59a9f4..2a56dc60335d9 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -38,6 +38,7 @@ enum AttrType { FLOAT64S = 12; VAR = 13; VARS = 14; + FLOAT64 = 15; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -62,6 +63,7 @@ message OpDesc { repeated double float64s = 16; optional string var_name = 17; repeated string vars_name = 18; + optional double float64 = 19; }; message Var { diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index debe43fab8250..3a451c19ec20e 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -482,6 +482,10 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, infer_meta_context.EmplaceBackAttr( phi::Scalar(PADDLE_GET_CONST(float, attr))); break; + case framework::proto::AttrType::FLOAT64: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(double, attr))); + break; case framework::proto::AttrType::INT: infer_meta_context.EmplaceBackAttr( phi::Scalar(PADDLE_GET_CONST(int, attr))); break; @@ -651,6 +655,10 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, case phi::AttributeType::FLOAT32: infer_meta_context.EmplaceBackAttr(PADDLE_GET_CONST(float, attr)); break; + case phi::AttributeType::FLOAT64: + infer_meta_context.EmplaceBackAttr( + PADDLE_GET_CONST(double, attr)); + break; case phi::AttributeType::INT32: infer_meta_context.EmplaceBackAttr(PADDLE_GET_CONST(int, attr)); break; diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 32277ed54bb7f..6d0f79119459e 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++
b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -18,6 +18,10 @@ #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/phi/backends/onednn/onednn_context.h" +#endif + namespace paddle { namespace framework { namespace interpreter { @@ -200,11 +204,27 @@ std::shared_ptr TransferLayout(const std::string& var_name, framework::Scope* local_scope, bool is_fetch_v2) { #ifdef PADDLE_WITH_MKLDNN + // NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in fetch_op.cc if (in_layout == framework::DataLayout::kMKLDNN && var_name == framework::GradVarName("Filter") && is_fetch_v2) { + VLOG(4) << "Match special case(Filter && fetch_v2) " << var_name; out_layout = framework::DataLayout::kNCHW; } + + if (in_layout == framework::DataLayout::MKLDNN && + out_layout != framework::DataLayout::MKLDNN) { + auto target_layout = phi::OneDNNContext::tls().get_cur_paddle_data_layout(); + VLOG(4) << "TransDataLayoutFromMKLDNN: " << in_layout << "->" + << target_layout; + + if (out_layout == DataLayout::kNCHW && + var_name == framework::GradVarName("Filter")) { + VLOG(4) << "Match special case(Filter) " << var_name; + target_layout = out_layout; + } + out_layout = target_layout; + } #endif // 1. Generate new_var_name and Initialize it diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 6b6eb3f8d674b..f57a99e84cce6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -566,12 +566,16 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { : var_scope_.GetMutableScope(); #ifdef PADDLE_WITH_ASCEND_CL - // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable - // values, but only through special `float_status` to checks whether - // the operation is overflow. More about `float_status`, see: - // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue - if (FLAGS_check_nan_inf) { - framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place); + if (platform::is_npu_place(place)) { + auto dev_id = place.device; + platform::SetNPUDeviceId(dev_id); + // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable + // values, but only through special `float_status` to checks whether + // the operation is overflow. 
More about `float_status`, see: + // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue + if (FLAGS_check_nan_inf) { + framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place); + } } #endif diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index e5d8f6f9f0e30..507f7cd166ea0 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -668,6 +668,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = std::vector<float>(); break; } + case proto::AttrType::FLOAT64S: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to FLOAT64S"; + this->attrs_[name] = std::vector<double>(); + break; + } case proto::AttrType::STRINGS: { VLOG(11) << "SetAttr: " << Type() << ", " << name << " from INTS to STRINGS"; @@ -838,6 +844,7 @@ struct SetAttrDescVisitor { mutable proto::OpDesc::Attr *attr_; void operator()(int v) const { attr_->set_i(v); } void operator()(float v) const { attr_->set_f(v); } + void operator()(double v) const { attr_->set_float64(v); } void operator()(const std::string &v) const { attr_->set_s(v); } // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162 diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b5d6a3786c3d9..23fce93ef30a3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2745,6 +2745,10 @@ void OperatorWithKernel::BuildPhiKernelContext( phi_kernel_context->EmplaceBackAttr(std::move( phi::Scalar(PADDLE_GET_CONST(float, attr_iter->second)))); break; + case proto::AttrType::FLOAT64: + phi_kernel_context->EmplaceBackAttr(std::move( + phi::Scalar(PADDLE_GET_CONST(double, attr_iter->second)))); + break; case proto::AttrType::INT: phi_kernel_context->EmplaceBackAttr(std::move( phi::Scalar(PADDLE_GET_CONST(int, attr_iter->second)))); break; @@ -2884,6 +2888,10 @@ void OperatorWithKernel::BuildPhiKernelContext( phi_kernel_context->EmplaceBackAttr( PADDLE_GET_CONST(float, attr_iter->second)); break; + case phi::AttributeType::FLOAT64: + phi_kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(double, attr_iter->second)); + break; case phi::AttributeType::INT32: phi_kernel_context->EmplaceBackAttr( PADDLE_GET_CONST(int, attr_iter->second)); break; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 31a006914aca7..d4739209e7ae7 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -58,7 +58,8 @@ using Attribute = paddle::variant<paddle::blank, std::vector<double>, VarDesc*, - std::vector<VarDesc*>>; + std::vector<VarDesc*>, + double>; using AttributeMap = std::unordered_map<std::string, Attribute>; #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d043b4a5aad18..1e76757e1c048 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -412,6 +412,10 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, kernel_ctx->EmplaceBackAttr( std::move(phi::Scalar(PADDLE_GET_CONST(float, attr)))); break; + case framework::proto::AttrType::FLOAT64: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(PADDLE_GET_CONST(double, attr)))); + break; case framework::proto::AttrType::INT: kernel_ctx->EmplaceBackAttr( std::move(phi::Scalar(PADDLE_GET_CONST(int, attr)))); break; @@ -549,6 +553,9 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, case phi::AttributeType::FLOAT32:
kernel_ctx->EmplaceBackAttr(PADDLE_GET_CONST(float, attr)); break; + case phi::AttributeType::FLOAT64: + kernel_ctx->EmplaceBackAttr(PADDLE_GET_CONST(double, attr)); + break; case phi::AttributeType::INT32: kernel_ctx->EmplaceBackAttr(PADDLE_GET_CONST(int, attr)); break; diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 54017666a77d2..0eed1a4f5e71f 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -56,14 +56,10 @@ class LayerNormOpConverter : public OpConverter { nvinfer1::ILayer* layernorm_layer = nullptr; if (engine_->with_dynamic_shape()) { - int statis_num = 1; // For dynamic shape, - // the batch num will be taken into account in plugin runtime. - for (int i = 1; i < begin_norm_axis; i++) { - statis_num *= X->getDimensions().d[i]; - } - std::vector mean_shape{statis_num}; - std::vector variance_shape{statis_num}; + // the shape of mean and variance will be determine in configuPlugin. + std::vector mean_shape{1}; + std::vector variance_shape{1}; plugin::LayerNormPluginDynamic* plugin = new plugin::LayerNormPluginDynamic( static_cast(bias_weight.get().values), @@ -77,7 +73,7 @@ class LayerNormOpConverter : public OpConverter { layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin); } else { int statis_num = 1; - for (int i = 0; i < begin_norm_axis; i++) { + for (int i = 1; i < begin_norm_axis; i++) { statis_num *= X->getDimensions().d[i]; } std::vector mean_shape{statis_num}; diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a597a484f9e58..8443c92241b0c 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -291,333 +291,342 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.data(), plugin_inputs.size(), *plugin); layer = plugin_layer; } - } - if (input_dims.d[1] <= 384 && !bias_qk_attr && - engine_->precision() != AnalysisConfig::Precision::kFloat32) { - /* - * input_dims.d[0]: batch(-1) - * input_dims.d[1]: length:256 - * input_dims.d[2]: hidden_size:768 - input - |[b,256,768] - | - shuffle weight bias - |[b,256,768,1,1] | | - |_____________________|_________| - | - fc - |[b,256,2304,1,1] - | - shuffle mask(fake) pos max_length - |[b*256,2304,1,1] | | | - | | | | - |_______________________|_________|________| - | - MHA - |[b*256,768] - | - shuffle - |[b, 256, 768] - | - out - */ - - nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(weight_t->numel())}; - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, - static_cast(bias_data), - static_cast(bias_t->numel())}; - - /*** transpose the weight and bias ***/ - int head_size = hidden_out / head_number; - // [3, head_number, head_size, hidden_in] -> [head_number, 3, - // head_size, hidden_in] - auto transpose_weight_v2 = [](const float* src, - float* dst, - int three, - int head_number, - int head_size, - int hidden_in) { - const int HH = head_size * hidden_in; - for (int i = 0; i < three; ++i) { - for (int n = 0; n < head_number; ++n) { - for (int hh = 0; hh < HH; ++hh) { - dst[n * three * HH + i * HH + hh] = - src[i * head_number * HH + n * HH + hh]; + } else { + if (input_dims.d[1] <= 384 && !bias_qk_attr && + engine_->precision() != AnalysisConfig::Precision::kFloat32) { + /* + * input_dims.d[0]: batch(-1) + * 
input_dims.d[1]: length:256 + * input_dims.d[2]: hidden_size:768 + input + |[b,256,768] + | + shuffle weight bias + |[b,256,768,1,1] | | + |_____________________|_________| + | + fc + |[b,256,2304,1,1] + | + shuffle mask(fake) pos max_length + |[b*256,2304,1,1] | | | + | | | | + |_______________________|_________|________| + | + MHA + |[b*256,768] + | + shuffle + |[b, 256, 768] + | + out + */ + + nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(weight_t->numel())}; + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_t->numel())}; + + /*** transpose the weight and bias ***/ + int head_size = hidden_out / head_number; + // [3, head_number, head_size, hidden_in] -> [head_number, 3, + // head_size, hidden_in] + auto transpose_weight_v2 = [](const float* src, + float* dst, + int three, + int head_number, + int head_size, + int hidden_in) { + const int HH = head_size * hidden_in; + for (int i = 0; i < three; ++i) { + for (int n = 0; n < head_number; ++n) { + for (int hh = 0; hh < HH; ++hh) { + dst[n * three * HH + i * HH + hh] = + src[i * head_number * HH + n * HH + hh]; + } } } - } - }; - // [3, head_number, head_size] -> [head_number, 3, head_size] - auto transpose_bias_v2 = - [](const float* src, float* dst, int N, int H) { - for (int i = 0; i < 3; ++i) { - for (int n = 0; n < N; ++n) { - for (int h = 0; h < H; ++h) { - dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h]; + }; + // [3, head_number, head_size] -> [head_number, 3, head_size] + auto transpose_bias_v2 = + [](const float* src, float* dst, int N, int H) { + for (int i = 0; i < 3; ++i) { + for (int n = 0; n < N; ++n) { + for (int h = 0; h < H; ++h) { + dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h]; + } } } - } - }; - memcpy(weight_data_tmp.data(), - weight_data, - weight_t->numel() * sizeof(float)); - transpose_weight_v2(weight_data_tmp.data(), - weight_data, - three, - head_number, - head_size, - hidden_in); - - std::vector bias_data_tmp; - bias_data_tmp.reserve(bias_t->numel()); - memcpy( - bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float)); - transpose_bias_v2( - bias_data_tmp.data(), bias_data, head_number, head_size); - - // add shuffle for FullyConnected layer - std::vector reshape_before_fc_shape_tensor; - nvinfer1::ITensor* input_shape_tensor = Shape(input); - for (int i = 0; i < 5; i++) { - reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1)); - } - for (int i = 0; i < 3; i++) { - reshape_before_fc_shape_tensor[i] = - GetEleTensorOfShape(input_shape_tensor, i); - } - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - reshape_before_fc_layer->setInput( - 1, *Concat(reshape_before_fc_shape_tensor)); - reshape_before_fc_layer->setName( - ("shuffle_before_fc_multihead_matmul(Output: " + output_name + ")") - .c_str()); - - // add fc layer - nvinfer1::ILayer* fc_layer = nullptr; - fc_layer = TRT_ENGINE_ADD_LAYER(engine_, - FullyConnected, - *reshape_before_fc_layer->getOutput(0), - n, - weight, - bias); - - // add shuffle for CustomQKVToContextPluginDynamic layer - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - std::vector mha_input_tensor_shape; - mha_input_tensor_shape.push_back(Add1DConstantLayer(-1)); - mha_input_tensor_shape.push_back( - Add1DConstantLayer(hidden_out * 3)); // Q,K,V - mha_input_tensor_shape.push_back(Add1DConstantLayer(1)); - mha_input_tensor_shape.push_back(Add1DConstantLayer(1)); - 
reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape)); - reshape_after_fc_layer->setName( - ("shuffle_after_fc_multihead_matmul(Output: " + output_name + ")") - .c_str()); - - // add mha_plugin - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomQKVToContextPluginDynamic", "2"); - assert(creator != nullptr); - // set the attributes of mha_plugin - int type = static_cast(nvinfer1::DataType::kHALF); - int var_seqlen = 1; - bool has_mask = true; - std::vector fields{ - {"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, 1}, - {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, - {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}}; - nvinfer1::PluginFieldCollection* plugin_collection = - static_cast( - malloc(sizeof(*plugin_collection) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - plugin_collection->nbFields = static_cast(fields.size()); - plugin_collection->fields = fields.data(); - auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic", - plugin_collection); - free(plugin_collection); - // set inputs - std::vector plugin_inputs; - // input_0 for plugin - plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0)); - // input_1(fake) for plugin - std::vector mask = {1}; - nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask); - plugin_inputs.emplace_back(mask_tensor); - // input_2 for plugin - std::vector pos_id = {0}; - int max_batch = 500; - for (int i = 1; i < max_batch; i++) { - pos_id.push_back(i); - } - nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id); - nvinfer1::ITensor* length_tensor = - GetEleTensorOfShape(input_shape_tensor, 1); - auto pos_id_layer = - TRT_ENGINE_ADD_LAYER(engine_, - ElementWise, - *fake_pos_id_tensor, - *length_tensor, - nvinfer1::ElementWiseOperation::kPROD); - // size = batch + 1; - nvinfer1::ITensor* batch_tensor = - GetEleTensorOfShape(input_shape_tensor, 0); - std::vector const_data = {1}; - nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data); - auto size_layer = - TRT_ENGINE_ADD_LAYER(engine_, - ElementWise, - *batch_tensor, - *const_tensor, - nvinfer1::ElementWiseOperation::kSUM); - // get size(batch + 1) data from pos_id_tensor - nvinfer1::Dims start; - nvinfer1::Dims stride; - nvinfer1::Dims size; - - start.nbDims = 1; - stride.nbDims = 1; - size.nbDims = 1; - - start.d[0] = 0; - stride.d[0] = 1; - size.d[0] = 1; - - auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER( - engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride); - slice_pos_layer->setInput(2, *size_layer->getOutput(0)); - plugin_inputs.emplace_back(slice_pos_layer->getOutput(0)); - - // input_3 for plugin - std::vector data(500, 1); - nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data); - auto* slice_max_layer = TRT_ENGINE_ADD_LAYER( - engine_, Slice, *fake_max_seqlen_tensor, start, size, stride); - slice_max_layer->setInput(2, *length_tensor); - plugin_inputs.emplace_back(slice_max_layer->getOutput(0)); - // plugin_layer - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin); - - // add shuffle - auto* reshape_after_mha_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *plugin_layer->getOutput(0)); - std::vector reshape_tensor; - reshape_tensor.push_back(batch_tensor); - reshape_tensor.push_back(length_tensor); - 
reshape_tensor.push_back(Add1DConstantLayer(-1)); - reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor)); - reshape_after_mha_layer->setName( - ("shuffle_last_multihead_matmul(Output: " + output_name + ")") - .c_str()); - - // return - layer = reshape_after_mha_layer; - } else { - PADDLE_ENFORCE_EQ( - input->getDimensions().nbDims, - 3, - platform::errors::InvalidArgument( - "The Input dim of the MultiheadMatMul should be 3, " - "but it's (%d) now.", - input->getDimensions().nbDims)); - // transpose weight_data from m * n to n * m - auto* input_bias_qk = - engine_->GetITensor(op_desc.Input("BiasQK").front()); - - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(weight_t->numel())}; - weight.dims.assign({n, m}); - - TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, - static_cast(bias_data), - static_cast(bias_t->numel())}; - - // add shuffle before fc - std::vector reshape_before_fc_shape_tensor; - nvinfer1::ITensor* input_shape_tensor = Shape(input); - - for (int i = 0; i < 5; i++) { - reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1)); - } - for (int i = 0; i < 3; i++) { - reshape_before_fc_shape_tensor[i] = - GetEleTensorOfShape(input_shape_tensor, i); - } - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - if (op_desc.HasAttr("Input_scale")) { - engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0), - in_scale); - } - reshape_before_fc_layer->setInput( - 1, *Concat(reshape_before_fc_shape_tensor)); - reshape_before_fc_layer->setName( - ("shuffle_before_multihead_mamul(Output: " + output_name + ")") - .c_str()); - - // add layer fc - nvinfer1::ILayer* fc_layer = nullptr; - if (op_desc.HasAttr("Input_scale")) { - nvinfer1::DimsHW nv_ksize(1, 1); - fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, - Convolution, - *reshape_before_fc_layer->getOutput(0), - n, - nv_ksize, - weight.get(), - bias.get()); - } else { + }; + memcpy(weight_data_tmp.data(), + weight_data, + weight_t->numel() * sizeof(float)); + transpose_weight_v2(weight_data_tmp.data(), + weight_data, + three, + head_number, + head_size, + hidden_in); + + std::vector bias_data_tmp; + bias_data_tmp.reserve(bias_t->numel()); + memcpy( + bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float)); + transpose_bias_v2( + bias_data_tmp.data(), bias_data, head_number, head_size); + + // add shuffle for FullyConnected layer + std::vector reshape_before_fc_shape_tensor; + nvinfer1::ITensor* input_shape_tensor = Shape(input); + for (int i = 0; i < 5; i++) { + reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1)); + } + for (int i = 0; i < 3; i++) { + reshape_before_fc_shape_tensor[i] = + GetEleTensorOfShape(input_shape_tensor, i); + } + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + reshape_before_fc_layer->setInput( + 1, *Concat(reshape_before_fc_shape_tensor)); + reshape_before_fc_layer->setName( + ("shuffle_before_fc_multihead_matmul(Output: " + output_name + + ")") + .c_str()); + + // add fc layer + nvinfer1::ILayer* fc_layer = nullptr; fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, - weight.get(), - bias.get()); - } + weight, + bias); + + // add shuffle for CustomQKVToContextPluginDynamic layer + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); + std::vector mha_input_tensor_shape; + mha_input_tensor_shape.push_back(Add1DConstantLayer(-1)); + 
mha_input_tensor_shape.push_back( + Add1DConstantLayer(hidden_out * 3)); // Q,K,V + mha_input_tensor_shape.push_back(Add1DConstantLayer(1)); + mha_input_tensor_shape.push_back(Add1DConstantLayer(1)); + reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape)); + reshape_after_fc_layer->setName( + ("shuffle_after_fc_multihead_matmul(Output: " + output_name + ")") + .c_str()); + + // add mha_plugin + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomQKVToContextPluginDynamic", "2"); + assert(creator != nullptr); + // set the attributes of mha_plugin + int type = static_cast(nvinfer1::DataType::kHALF); + int var_seqlen = 1; + bool has_mask = true; + std::vector fields{ + {"hidden_size", + &hidden_out, + nvinfer1::PluginFieldType::kINT32, + 1}, + {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, + {"var_seqlen", + &var_seqlen, + nvinfer1::PluginFieldType::kINT32, + 1}}; + nvinfer1::PluginFieldCollection* plugin_collection = + static_cast(malloc( + sizeof(*plugin_collection) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + plugin_collection->nbFields = static_cast(fields.size()); + plugin_collection->fields = fields.data(); + auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic", + plugin_collection); + free(plugin_collection); + // set inputs + std::vector plugin_inputs; + // input_0 for plugin + plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0)); + // input_1(fake) for plugin + std::vector mask = {1}; + nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask); + plugin_inputs.emplace_back(mask_tensor); + // input_2 for plugin + std::vector pos_id = {0}; + int max_batch = 500; + for (int i = 1; i < max_batch; i++) { + pos_id.push_back(i); + } + nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id); + nvinfer1::ITensor* length_tensor = + GetEleTensorOfShape(input_shape_tensor, 1); + auto pos_id_layer = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *fake_pos_id_tensor, + *length_tensor, + nvinfer1::ElementWiseOperation::kPROD); + // size = batch + 1; + nvinfer1::ITensor* batch_tensor = + GetEleTensorOfShape(input_shape_tensor, 0); + std::vector const_data = {1}; + nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data); + auto size_layer = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *batch_tensor, + *const_tensor, + nvinfer1::ElementWiseOperation::kSUM); + // get size(batch + 1) data from pos_id_tensor + nvinfer1::Dims start; + nvinfer1::Dims stride; + nvinfer1::Dims size; + + start.nbDims = 1; + stride.nbDims = 1; + size.nbDims = 1; + + start.d[0] = 0; + stride.d[0] = 1; + size.d[0] = 1; + + auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride); + slice_pos_layer->setInput(2, *size_layer->getOutput(0)); + plugin_inputs.emplace_back(slice_pos_layer->getOutput(0)); + + // input_3 for plugin + std::vector data(500, 1); + nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data); + auto* slice_max_layer = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *fake_max_seqlen_tensor, start, size, stride); + slice_max_layer->setInput(2, *length_tensor); + plugin_inputs.emplace_back(slice_max_layer->getOutput(0)); + // plugin_layer + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + + // add shuffle + auto* reshape_after_mha_layer = 
+
+          // add shuffle
+          auto* reshape_after_mha_layer =
+              TRT_ENGINE_ADD_LAYER(
+                  engine_, Shuffle, *plugin_layer->getOutput(0));
+          std::vector<nvinfer1::ITensor*> reshape_tensor;
+          reshape_tensor.push_back(batch_tensor);
+          reshape_tensor.push_back(length_tensor);
+          reshape_tensor.push_back(Add1DConstantLayer(-1));
+          reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor));
+          reshape_after_mha_layer->setName(
+              ("shuffle_last_multihead_matmul(Output: " + output_name + ")")
+                  .c_str());
-
-        if (op_desc.HasAttr("fc_out_threshold")) {
+          // return
+          layer = reshape_after_mha_layer;
+        } else {
           PADDLE_ENFORCE_EQ(
-              op_desc.HasAttr("fc_out_threshold"),
-              true,
+              input->getDimensions().nbDims,
+              3,
               platform::errors::InvalidArgument(
-                  "must have out threshold in multihead layers in int8 mode"));
-          float out_scale =
-              PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
-          engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
-        }
-        fc_layer->setName(
-            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
+                  "The Input dim of the MultiheadMatMul should be 3, "
+                  "but it's (%d) now.",
+                  input->getDimensions().nbDims));
+          // transpose weight_data from m * n to n * m
+          auto* input_bias_qk =
+              engine_->GetITensor(op_desc.Input("BiasQK").front());
+
+          TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                        static_cast<void*>(weight_data),
+                                        static_cast<size_t>(weight_t->numel())};
+          weight.dims.assign({n, m});
+
+          TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(bias_data),
+                                      static_cast<size_t>(bias_t->numel())};
+
+          // add shuffle before fc
+          std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
+          nvinfer1::ITensor* input_shape_tensor = Shape(input);
+
+          for (int i = 0; i < 5; i++) {
+            reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
+          }
+          for (int i = 0; i < 3; i++) {
+            reshape_before_fc_shape_tensor[i] =
+                GetEleTensorOfShape(input_shape_tensor, i);
+          }
+          auto* reshape_before_fc_layer =
+              TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+          if (op_desc.HasAttr("Input_scale")) {
+            engine_->SetTensorDynamicRange(
+                reshape_before_fc_layer->getOutput(0), in_scale);
+          }
+          reshape_before_fc_layer->setInput(
+              1, *Concat(reshape_before_fc_shape_tensor));
+          reshape_before_fc_layer->setName(
+              ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
+                  .c_str());
+
+          // add layer fc
+          nvinfer1::ILayer* fc_layer = nullptr;
+          if (op_desc.HasAttr("Input_scale")) {
+            nvinfer1::DimsHW nv_ksize(1, 1);
+            fc_layer =
+                TRT_ENGINE_ADD_LAYER(engine_,
+                                     Convolution,
+                                     *reshape_before_fc_layer->getOutput(0),
+                                     n,
+                                     nv_ksize,
+                                     weight.get(),
+                                     bias.get());
+          } else {
+            fc_layer =
+                TRT_ENGINE_ADD_LAYER(engine_,
+                                     FullyConnected,
+                                     *reshape_before_fc_layer->getOutput(0),
+                                     n,
+                                     weight.get(),
+                                     bias.get());
+          }
+
+          if (op_desc.HasAttr("fc_out_threshold")) {
+            PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"),
+                              true,
+                              platform::errors::InvalidArgument(
+                                  "must have out threshold in multihead layers "
+                                  "in int8 mode"));
+            float out_scale =
+                PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
+            engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+          }
+          fc_layer->setName(
+              ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
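In this non-varseqlen branch the FC is built as a 1x1 Convolution whenever an Input_scale attribute is present, presumably because the convolution path is the one with reliable int8 kernel support, and fc_out_threshold then supplies the quantization range of the FC output. In raw TensorRT terms, a SetTensorDynamicRange(tensor, s) wrapper like the one used here corresponds to a symmetric range (minimal sketch, assuming the wrapper does nothing beyond this):

    #include "NvInfer.h"

    // A threshold s maps to the symmetric int8 range [-s, s] on the tensor.
    inline bool SetSymmetricRange(nvinfer1::ITensor* t, float s) {
      return t->setDynamicRange(-s, s);
    }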
-
-        // no need to add shuffle after fc, just change it in
-        // QkvToContextPluginDynamic
+          // no need to add shuffle after fc, just change it in
+          // QkvToContextPluginDynamic
-
-        // add qkv to context
-        int head_size = hidden_out / head_number;
-        float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
+          // add qkv to context
+          int head_size = hidden_out / head_number;
+          float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
-        std::vector<nvinfer1::ITensor*> plugin_inputs;
-        plugin_inputs.push_back(fc_layer->getOutput(0));
-        plugin_inputs.push_back(input_bias_qk);
-        bool with_fp16 =
-            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+          std::vector<nvinfer1::ITensor*> plugin_inputs;
+          plugin_inputs.push_back(fc_layer->getOutput(0));
+          plugin_inputs.push_back(input_bias_qk);
+          bool with_fp16 =
+              engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
-        if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
-          with_fp16 = true;
+          if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
+            with_fp16 = true;
+          }
+          plugin::DynamicPluginTensorRT* plugin =
+              new plugin::QkvToContextPluginDynamic(
+                  hidden_in, head_number, head_size, scale, with_fp16);
+          layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
         }
-        plugin::DynamicPluginTensorRT* plugin =
-            new plugin::QkvToContextPluginDynamic(
-                hidden_in, head_number, head_size, scale, with_fp16);
-        layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
       }
     } else {
       PADDLE_THROW(platform::errors::Fatal(
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
index 48b9d3229c38c..da4ebdc6cb6e8 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
@@ -53,6 +53,22 @@ int LayerNormPlugin::enqueue(int batch_size,
   int begin_norm_axis = begin_norm_axis_;
   float eps = eps_;
 
+  PADDLE_ENFORCE_EQ(1,
+                    mean_shape_.size(),
+                    platform::errors::InvalidArgument(
+                        "Size of mean_shape vector should be equal to 1,"
+                        "but got Size of mean_shape vector:%d",
+                        mean_shape_.size()));
+  PADDLE_ENFORCE_EQ(1,
+                    variance_shape_.size(),
+                    platform::errors::InvalidArgument(
+                        "Size of variance_shape vector should be equal to 1,"
+                        "but got Size of variance_shape vector:%d",
+                        variance_shape_.size()));
+
+  int64_t batched_mean_shape = mean_shape_[0] * input_dims.d[0];
+  int64_t batched_variance_shape = variance_shape_[0] * input_dims.d[0];
+
   std::vector<int> input_shape;
   input_shape.push_back(batch_size);
   for (int i = 0; i < input_dims.nbDims; i++) {
@@ -78,8 +94,8 @@ int LayerNormPlugin::enqueue(int batch_size,
   scale_t.Resize(phi::make_ddim({feature_size}));
   bias_t.Resize(phi::make_ddim({feature_size}));
-  mean_t.Resize(phi::make_ddim(mean_shape_));
-  variance_t.Resize(phi::make_ddim(variance_shape_));
+  mean_t.Resize(phi::make_ddim({batched_mean_shape}));
+  variance_t.Resize(phi::make_ddim({batched_variance_shape}));
   int device_id;
   cudaGetDevice(&device_id);
   float *scale_d = scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
@@ -147,6 +163,20 @@ bool LayerNormPluginDynamic::supportsFormatCombination(
   return in.type == prev.type && in.format == prev.format;
 }
 
+void LayerNormPluginDynamic::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc *in,
+    int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc *out,
+    int nbOutputs) TRT_NOEXCEPT {
+  const auto &input_dims = in[0].desc.dims;
+  int statis_num = 1;
+  for (int i = 0; i < begin_norm_axis_; i++) {
+    statis_num *= input_dims.d[i];
+  }
+  mean_shape_[0] = statis_num;
+  variance_shape_[0] = statis_num;
+}
+
 nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType(
     int index,
     const nvinfer1::DataType *input_types,
@@ -189,8 +219,19 @@ int LayerNormPluginDynamic::enqueue(
                         "Size of variance_shape vector should be equal to 1,"
                         "but got Size of mean_shape vector:%d",
                         mean_shape_.size()));
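Two sizing strategies meet in this file: the static-shape plugin (first hunk above) multiplies the serialized mean/variance shape by the runtime batch inside enqueue, while the dynamic plugin now folds the batch into mean_shape_[0] once, in configurePlugin, so its enqueue can use mean_shape_ as stored. Both arrive at one statistic per normalized row; a minimal sketch of that count, with the example numbers assumed purely for illustration:

    #include <cstdint>
    #include <vector>

    // e.g. dims = {2, 128, 768} with begin_norm_axis = 2 gives 2 * 128 = 256
    // mean slots and 256 variance slots, one per row of length 768.
    int64_t StatisticCount(const std::vector<int64_t>& dims, int begin_norm_axis) {
      int64_t n = 1;
      for (int i = 0; i < begin_norm_axis; ++i) n *= dims[i];
      return n;
    }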
-  int64_t batched_mean_shape = mean_shape_[0] * input_dims.d[0];
-  int64_t batched_variance_shape = variance_shape_[0] * input_dims.d[0];
+  PADDLE_ENFORCE_GE(mean_shape_[0],
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The size of mean vector should be positive,"
+                        "but got:%d",
+                        mean_shape_[0]));
+  PADDLE_ENFORCE_GE(variance_shape_[0],
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The size of variance vector should be positive,"
+                        "but got:%d",
+                        variance_shape_[0]));
+
   const auto input_ddim = phi::make_ddim(input_shape);
   auto matrix_dim = phi::flatten_to_2d(input_ddim, begin_norm_axis);
   int feature_size = static_cast<int>(matrix_dim[1]);
@@ -217,8 +258,8 @@ int LayerNormPluginDynamic::enqueue(
   float *output = static_cast<float *>(outputs[0]);
   scale_t.Resize(phi::make_ddim({feature_size}));
   bias_t.Resize(phi::make_ddim({feature_size}));
-  mean_t.Resize(phi::make_ddim({batched_mean_shape}));
-  variance_t.Resize(phi::make_ddim({batched_variance_shape}));
+  mean_t.Resize(phi::make_ddim(mean_shape_));
+  variance_t.Resize(phi::make_ddim(variance_shape_));
   float *scale_d = scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
index 64cfde8e4a76b..a8ccabb3cff59 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
@@ -215,7 +215,7 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
   void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                        int nbInputs,
                        const nvinfer1::DynamicPluginTensorDesc* out,
-                       int nbOutputs) TRT_NOEXCEPT override {}
+                       int nbOutputs) TRT_NOEXCEPT override;
 
   size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                           int nbInputs,
diff --git a/paddle/fluid/jit/engine/pe_engine.cc b/paddle/fluid/jit/engine/pe_engine.cc
index ddc2de0fc530e..a2c6d69d16e4d 100644
--- a/paddle/fluid/jit/engine/pe_engine.cc
+++ b/paddle/fluid/jit/engine/pe_engine.cc
@@ -31,7 +31,7 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
   auto device_type = platform::Place2DeviceType(place);
   switch (device_type) {
     case platform::DeviceType::CPU: {
-      execution_strategy.num_threads_ = 2;
+      execution_strategy.num_threads_ = 1;
       break;
     }
     case platform::DeviceType::CUDA: {
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 6ccacddf070ed..4e0344b3b9391 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -86,6 +86,19 @@ class WhileOp : public framework::OperatorBase {
     std::set<std::string> no_copy_var_names;
     if (!is_test) {
+      // set all persistable parameters into no_copy_var_names.
+      auto *global_block = block;
+
+      while (global_block->ID() != 0)
+        global_block = global_block->ParentBlock();
+      auto all_vars = global_block->AllVars();
+      std::for_each(all_vars.begin(),
+                    all_vars.end(),
+                    [&no_copy_var_names](framework::VarDesc *var) {
+                      if (var->IsParameter())
+                        no_copy_var_names.insert(var->Name());
+                    });
+
       const std::vector<framework::OpDesc *> &all_ops = block->AllOps();
       for (const framework::OpDesc *op : all_ops) {
         const framework::VariableNameMap &input_var_names = op->Inputs();
diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc
deleted file mode 100644
index 08a58678a2ea8..0000000000000
--- a/paddle/fluid/operators/conv_transpose_op_xpu.cc
+++ /dev/null
@@ -1,219 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors.
All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/phi/kernels/cpu/conv_util.h" - -#ifdef PADDLE_WITH_XPU -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// target_len == 2 || target_len == 4 -inline std::vector vector_extend(const std::vector& src, - int target_len) { - if (target_len == 2 && src.size() == 1) { - return {src[0], src[0]}; - } - if (target_len == 4 && src.size() == 1) { - return {src[0], src[0], src[0], src[0]}; - } - if (target_len == 4 && src.size() == 2) { - return {src[0], src[0], src[1], src[1]}; - } - return src; -} - -template -class Conv2DTransposeXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - int groups = context.Attr("groups"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string data_format = context.Attr("data_format"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - - PADDLE_ENFORCE_EQ( - data_format == "NHWC" || data_format == "NDHWC", - false, - platform::errors::InvalidArgument( - ("XPU do support data_format is NCHW in conv_transpose op."))); - - framework::DDim in_data_dims = - phi::slice_ddim(input->dims(), 2, input->dims().size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter.dims(), 2, filter.dims().size()); - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - const int batch_size = static_cast(input->dims()[0]); - const int img_yc = static_cast(input->dims()[1]); - const int img_yh = static_cast(input->dims()[2]); - const int img_yw = static_cast(input->dims()[3]); - const int img_xc = static_cast(output->dims()[1]); - const int img_xh = static_cast(output->dims()[2]); - const int img_xw = static_cast(output->dims()[3]); - - { - std::vector ksize_check = vector_extend(ksize, 2); - std::vector stride_check = vector_extend(strides, 2); - std::vector pad_check = vector_extend(paddings, 4); - std::vector dilation_check = vector_extend(dilations, 2); - - int xh_check = (img_yh - 1) * stride_check[0] - pad_check[0] - - pad_check[1] + - (dilation_check[0] * (ksize_check[0] - 1) + 1); - int xw_check = (img_yw - 1) * stride_check[1] - pad_check[2] - - pad_check[3] + - (dilation_check[1] * (ksize_check[1] - 1) + 1); - - PADDLE_ENFORCE_EQ( - 
xh_check == img_xh && xw_check == img_xw, - true, - platform::errors::InvalidArgument( - ("XPU output size check error in conv_transpose op."))); - } - - auto& dev_ctx = context.template device_context(); - int r = xpu::conv2d_transpose( - dev_ctx.x_context(), - input->data(), - filter.data(), - output->data(), - batch_size, - img_yc, - img_yh, - img_yw, - img_xc, - ksize, - strides, - paddings, - dilations, - groups, - nullptr, - nullptr, - nullptr, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose"); - } -}; - -template -class Conv2DTransposeGradXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. - Tensor filter = *context.Input("Filter"); - if (!input_grad && !filter_grad) return; - int groups = context.Attr("groups"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string data_format = context.Attr("data_format"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - - PADDLE_ENFORCE_EQ( - data_format == "NHWC" || data_format == "NDHWC", - false, - platform::errors::InvalidArgument( - ("XPU do support data_format is NCHW in conv grad op."))); - - framework::DDim in_data_dims = - phi::slice_ddim(input->dims(), 2, input->dims().size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter.dims(), 2, filter.dims().size()); - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - const int batch_size = static_cast(input->dims()[0]); - const int img_yc = static_cast(input->dims()[1]); - const int img_yh = static_cast(input->dims()[2]); - const int img_yw = static_cast(input->dims()[3]); - const int img_xc = static_cast(output_grad->dims()[1]); - const int img_xh = static_cast(output_grad->dims()[2]); - const int img_xw = static_cast(output_grad->dims()[3]); - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - } - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - } - - auto& dev_ctx = context.template device_context(); - int r = xpu::conv2d_transpose_grad( - dev_ctx.x_context(), - input->data(), - filter.data(), - output_grad->data(), - input_grad ? input_grad->data() : nullptr, - filter_grad ? 
filter_grad->data() : nullptr, - batch_size, - img_yc, - img_yh, - img_yw, - img_xc, - img_xh, - img_xw, - ksize, - strides, - paddings, - dilations, - groups, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_grad"); - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - conv2d_transpose, - ops::Conv2DTransposeXPUKernel); -REGISTER_OP_XPU_KERNEL( - conv2d_transpose_grad, - ops::Conv2DTransposeGradXPUKernel); -#endif diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc deleted file mode 100644 index 166067db1b463..0000000000000 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_XPU - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/full_kernel.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeXPUKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, - T>::type>::type; - using XPUInTDType = typename XPUTypeTrait::Type; - - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), - value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), - false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - auto& dev_ctx = - context.template device_context(); - - // call phi kernel - phi::FullLikeKernel( - static_cast::TYPE&>(dev_ctx), - *x, - value, - phi::DataType::UNDEFINED, - out); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_XPU_KERNEL(fill_any_like, - ops::FillAnyLikeXPUKernel, - ops::FillAnyLikeXPUKernel, - ops::FillAnyLikeXPUKernel, - ops::FillAnyLikeXPUKernel); - -#endif diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc index 32a19750f420a..93fd3c5f3ddbd 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing 
permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/fill_constant_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/utils.h" diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 02f89cfdd2691..479b2e19096e5 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 68ea043d680cb..0dd0e1dcecf6b 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_constant_op.h" - #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h deleted file mode 100644 index bd8303fe402f4..0000000000000 --- a/paddle/fluid/operators/fill_constant_op.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class FillConstantKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - - auto str_value = ctx.Attr("str_value"); - auto float_value = ctx.Attr("value"); - auto force_cpu = ctx.Attr("force_cpu"); - auto place_type = ctx.Attr("place_type"); - framework::Tensor *tensor = nullptr; - - framework::Variable *out_var = ctx.OutputVar("Out"); - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - if (ctx.HasInput("ValueTensor")) { - auto *value_tensor = ctx.Input("ValueTensor"); - PADDLE_ENFORCE_EQ( - value_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "When use Tensor as value to set Tensor value in fill_cosntant, " - "value input(ValueTensor) size must be 1, but get %d", - value_tensor->numel())); - const T *tensor_data = value_tensor->data(); - framework::Tensor cpu_tensor; - auto tmp_place = value_tensor->place(); - if (platform::is_gpu_place(tmp_place) || - platform::is_xpu_place(tmp_place)) { - paddle::framework::TensorCopySync( - *value_tensor, platform::CPUPlace(), &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - value = tensor_data[0]; - } - auto shape = GetShape(ctx); - - if (out_var->IsType()) { - tensor = out_var->GetMutable(); - tensor->Resize(shape); - } else if (out_var->IsType()) { - tensor = out_var->GetMutable()->mutable_value(); - tensor->Resize(shape); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "In fill constant Op, the output only supports SelectedRows and " - "LoDTensor.")); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - int actual_place = place_type; - - if (actual_place == -1) { - bool cpu_place = (force_cpu || ctx.GetPlace() == platform::CPUPlace() || - data_type == framework::proto::VarType::BF16); - if (cpu_place) { - actual_place = 0; - } else if (platform::is_gpu_place(ctx.GetPlace())) { - actual_place = 1; - } else if (platform::is_xpu_place(ctx.GetPlace())) { - actual_place = 3; - } - } - - if (actual_place == 0) { - VLOG(4) << "[CPU] FillConstantKernel" - << ((data_type == framework::proto::VarType::BF16) ? 
"" - : ""); - tensor->mutable_data(platform::CPUPlace(), - framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; - auto &dev_ctx = *pool.Get(platform::CPUPlace()); - functor(reinterpret_cast(dev_ctx), - tensor, - static_cast(value)); - } else if (actual_place == 1) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - tensor->mutable_data(ctx.GetPlace(), - framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - functor(reinterpret_cast(dev_ctx), - tensor, - static_cast(value)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } else if (actual_place == 2) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - tensor->mutable_data(platform::CUDAPinnedPlace(), - framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; - auto &dev_ctx = *pool.Get(platform::CUDAPinnedPlace()); - functor( - reinterpret_cast(dev_ctx), - tensor, - static_cast(value)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } else if (actual_place == 3) { -#ifdef PADDLE_WITH_XPU - tensor->mutable_data(ctx.GetPlace(), - framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - functor(reinterpret_cast(dev_ctx), - tensor, - static_cast(value)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with XPU.")); -#endif - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Could NOT determine the place of variable, place_type = %d .", - actual_place)); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fill_constant_op_mlu.cc b/paddle/fluid/operators/fill_constant_op_mlu.cc index 4256945777bb1..e2b4cd547d24b 100644 --- a/paddle/fluid/operators/fill_constant_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 097423afe9852..6e7f5e289142a 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/fill_constant_op_xpu.cc b/paddle/fluid/operators/fill_constant_op_xpu.cc deleted file mode 100644 index 62ca2e672f998..0000000000000 --- a/paddle/fluid/operators/fill_constant_op_xpu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/fill_constant_op.h" - -namespace ops = paddle::operators; -#ifdef PADDLE_WITH_XPU -REGISTER_OP_XPU_KERNEL( - fill_constant, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel>, - ops::FillConstantKernel>); -#endif diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index f4fea27aeadac..80964323e6b01 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 81b53c8b94976..4df716f79f2af 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/fill_constant_op.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" namespace paddle { diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc index a0200d623ec76..8b3af57d923fe 100644 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/gaussian_random_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/grid_sampler_op_xpu.cc b/paddle/fluid/operators/grid_sampler_op_xpu.cc deleted file mode 100644 index 2843a90492cec..0000000000000 --- a/paddle/fluid/operators/grid_sampler_op_xpu.cc +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_XPU - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GridSamplerXPUKernel : public framework::OpKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(context.GetPlace()), - true, - platform::errors::Unavailable("This kernel only runs on XPU.")); - - // input and output data - const Tensor* input = context.Input("X"); - const Tensor* grid = context.Input("Grid"); - Tensor* output = context.Output("Output"); - - int n = input->dims()[0]; - int c = input->dims()[1]; - int h = input->dims()[2]; - int w = input->dims()[3]; - int out_h = grid->dims()[1]; - int out_w = grid->dims()[2]; - - // attrs - // paddle.nn.functional.grid_sample(x, grid, mode='bilinear', - // padding_mode='zeros', align_corners=True, name=None) - const std::string mode = context.Attr("mode"); - const std::string padding_mode = context.Attr("padding_mode"); - bool align_corners_bool = context.Attr("align_corners"); - const std::string data_format = - paddle::framework::DataLayoutToString(input->layout()); - - // attr to real param - bool is_nearest_bool; - if (mode == "bilinear") { - is_nearest_bool = false; - } else if (mode == "nearest") { - is_nearest_bool = true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "should not reach here: mode should be either 'bilinear' or " - "'nearest', bot got %s.", - mode)); - } - - // attention: 0: zeros, 2: reflection, 1: border according to XDNN api. 
-    int padding_mode_int;
-    if (padding_mode == "zeros") {
-      padding_mode_int = 0;
-    } else if (padding_mode == "reflection") {
-      padding_mode_int = 2;
-    } else if (padding_mode == "border") {
-      padding_mode_int = 1;
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "should not reach here: padding_mode should be either 'zeros' or "
-          "'reflection' or 'border', bot got %s.",
-          padding_mode));
-    }
-
-    bool is_nchw_bool;
-    if (data_format == "NCHW") {
-      is_nchw_bool = true;
-    } else if (data_format == "NHWC") {
-      is_nchw_bool = false;
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "should not reach here: data_format should be either 'NCHW' or "
-          "'NHWC', bot got %s.",
-          data_format));
-    }
-
-    // data pointers
-    const T* input_data = input->data<T>();
-    const T* grid_data = grid->data<T>();
-    T* output_data =
-        output->mutable_data<T>({n, c, out_h, out_w}, context.GetPlace());
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    // int grid_sample(Context* ctx, const T* x, const T* grid, T* y, int n, int
-    // c, int xh, int xw, int yh, int yw, bool is_nearest, bool align_corners,
-    // int padding_mode, bool is_nchw);
-    int r = xpu::grid_sample(dev_ctx.x_context(),
-                             input_data,
-                             grid_data,
-                             output_data,
-                             n,
-                             c,
-                             h,
-                             w,
-                             out_h,
-                             out_w,
-                             is_nearest_bool,
-                             align_corners_bool,
-                             padding_mode_int,
-                             is_nchw_bool);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "grid_sampler");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_XPU_KERNEL(
-    grid_sampler,
-    ops::GridSamplerXPUKernel<paddle::platform::XPUDeviceContext, float>);
-
-#endif
diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h
index dfecd0b0ef2b7..ab411ad8ac625 100644
--- a/paddle/fluid/operators/index_impl.cu.h
+++ b/paddle/fluid/operators/index_impl.cu.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/fill_constant_op.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
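The deleted XPU grid_sampler kernel above translated Paddle's string attributes into XDNN's integer convention before calling xpu::grid_sample; this hunk drops the fluid kernel without showing a replacement. The mapping it used, extracted for reference into a standalone helper:

    #include <stdexcept>
    #include <string>

    // XDNN convention, per the removed kernel's comment:
    // 0 = zeros, 1 = border, 2 = reflection.
    int PaddingModeToXdnnInt(const std::string& padding_mode) {
      if (padding_mode == "zeros") return 0;
      if (padding_mode == "border") return 1;
      if (padding_mode == "reflection") return 2;
      throw std::invalid_argument("unsupported padding_mode: " + padding_mode);
    }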
diff --git a/paddle/fluid/operators/load_combine_op_xpu.cc b/paddle/fluid/operators/load_combine_op_xpu.cc
new file mode 100644
index 0000000000000..9fa7ba3f7526a
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op_xpu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::XPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::XPUDeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::XPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index d5c22b9a2952e..ad3289213f87c 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -15,7 +15,8 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/fill_constant_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc
index 2a79d031dcb02..108bba3b4522a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc
@@ -112,6 +112,7 @@ class ReduceMeanGradXPUKernel : public framework::OpKernel<T> {
         d = d + xdims.size();
       }
       reduce_numel *= xdims[d];
+      ydims.insert(ydims.begin() + d, 1);
     }
 
     float val = 1.0f / static_cast<float>(reduce_numel);
diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc
index 1e1ff3494d029..86862d4a10f7d 100644
--- a/paddle/fluid/operators/transfer_layout_op.cc
+++ b/paddle/fluid/operators/transfer_layout_op.cc
@@ -16,7 +16,11 @@
 
 #include <string>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace framework {
@@ -37,34 +41,6 @@ class TransferLayoutOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "TransferLayout");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TransferLayout");
-
-    auto dst_layout = ctx->Attrs().Get<int>("dst_layout");
-    auto low_bound = static_cast<int>(framework::DataLayout::kAnyLayout);
-    auto upper_bound = static_cast<int>(framework::DataLayout::kMKLDNN);
-    PADDLE_ENFORCE_GE(
-        dst_layout,
-        low_bound,
-        platform::errors::PreconditionNotMet(
-            "Required dst_layout >= %d, but received dst_layout = %d",
-            low_bound,
-            dst_layout));
-    PADDLE_ENFORCE_LE(
-        dst_layout,
-        upper_bound,
-        platform::errors::PreconditionNotMet(
-            "Required dst_layout <= %d, but received dst_layout = %d",
-            upper_bound,
-            dst_layout));
-
-    // TODO(Aurelius84): Out's ddim is different with X because they have
-    // different layout
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -142,18 +118,18 @@ class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(transfer_layout,
+                            TransferLayoutInferShapeFunctor,
+                            PD_INFER_META(phi::TransferLayoutInferMeta));
 REGISTER_OPERATOR(
     transfer_layout,
     ops::TransferLayoutOp,
ops::TransferLayoutOpProtoMaker, ops::TransferLayoutInferVarType, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + TransferLayoutInferShapeFunctor); -// dtype is not important -REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, - float, - ops::TransferLayoutKernel); REGISTER_OP_VERSION(transfer_layout) .AddCheckpoint(R"ROC(refine transfer_layout, add src_layout attribute)ROC", paddle::framework::compatible::OpVersionDesc().NewAttr( diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 84e075f37c550..7eb90515785e4 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -326,6 +326,12 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"load_combine", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore index a6f20e21801f7..21476bb94662a 100644 --- a/paddle/fluid/pybind/.gitignore +++ b/paddle/fluid/pybind/.gitignore @@ -8,4 +8,4 @@ op_function6.cc op_function7.cc op_function8.cc eager_op_function.cc -eager_final_state_op_function.cc +eager_legacy_op_function.cc diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 61e9ad9c9b36b..6aa2e7394c90b 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -263,8 +263,9 @@ if(WITH_PYTHON) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) - add_executable(eager_op_function_generator eager_op_function_generator.cc) - target_link_libraries(eager_op_function_generator + add_executable(eager_legacy_op_function_generator + eager_legacy_op_function_generator.cc) + target_link_libraries(eager_legacy_op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) if(NOT WIN32) add_executable(kernel_signature_generator kernel_signature_generator.cc) @@ -274,10 +275,11 @@ if(WITH_PYTHON) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(op_function_generator ${os_dependency_modules}) - target_link_libraries(eager_op_function_generator ${os_dependency_modules}) + target_link_libraries(eager_legacy_op_function_generator + ${os_dependency_modules}) if(WITH_ROCM) target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB}) - target_link_libraries(eager_op_function_generator ${ROCM_HIPRTC_LIB}) + target_link_libraries(eager_legacy_op_function_generator ${ROCM_HIPRTC_LIB}) target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB}) endif() @@ -300,7 +302,7 @@ if(WITH_PYTHON) set(tmp_impl_file8 ${impl_file8}.tmp) set(CODE_GEN_SPLIT_FILE_COUNT "8") set(eager_impl_file - ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc) + ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_legacy_op_function.cc) set(tmp_eager_impl_file ${eager_impl_file}.tmp) execute_process( @@ -311,8 +313,8 @@ 
if(WITH_PYTHON) "${CODE_GEN_SPLIT_FILE_COUNT}") set(OP_IMPL_DEPS op_function_generator) - set(EAGER_OP_IMPL_DEPS eager_op_function_generator - eager_final_state_python_c_codegen) + set(EAGER_OP_IMPL_DEPS eager_legacy_op_function_generator + eager_python_c_codegen) if(WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") @@ -342,13 +344,13 @@ if(WITH_PYTHON) file( WRITE - ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat + ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_legacy_op_function_generator_retry.bat "" "set build_times=1\n" ":retry\n" - "ECHO eager_op_function_generator run %build_times% time\n" - "taskkill /f /im eager_op_function_generator.exe 2>NUL\n" - "${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n" + "ECHO eager_legacy_op_function_generator run %build_times% time\n" + "taskkill /f /im eager_legacy_op_function_generator.exe 2>NUL\n" + "${op_impl_path}/eager_legacy_op_function_generator.exe ${tmp_eager_impl_file}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" " if %build_times% GEQ 10 (\n" @@ -436,7 +438,7 @@ if(WITH_PYTHON) add_custom_command( OUTPUT ${eager_impl_file} COMMAND - ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat + ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_legacy_op_function_generator_retry.bat COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" @@ -525,7 +527,7 @@ if(WITH_PYTHON) OUTPUT ${eager_impl_file} COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." - "${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator" + "${CMAKE_CURRENT_BINARY_DIR}/eager_legacy_op_function_generator" "${tmp_eager_impl_file}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} @@ -536,7 +538,7 @@ if(WITH_PYTHON) endif() add_custom_target(op_function_generator_cmd ALL DEPENDS op_function) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_custom_target(eager_op_function_generator_cmd ALL + add_custom_target(eager_legacy_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) endif() @@ -549,8 +551,8 @@ if(WITH_PYTHON) set(PYBIND_SRCS eager_properties.cc ${PYBIND_SRCS}) set(PYBIND_SRCS eager_utils.cc ${PYBIND_SRCS}) set(PYBIND_SRCS eager_py_layer.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_legacy_op_function.cc ${PYBIND_SRCS}) set(PYBIND_SRCS eager_op_function.cc ${PYBIND_SRCS}) - set(PYBIND_SRCS eager_final_state_op_function.cc ${PYBIND_SRCS}) list(APPEND PYBIND_DEPS eager_api) list(APPEND PYBIND_DEPS autograd_meta) list(APPEND PYBIND_DEPS backward) @@ -575,8 +577,8 @@ if(WITH_PYTHON) DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_dependencies(paddle_pybind eager_codegen) - add_dependencies(paddle_pybind eager_op_function_generator_cmd) + add_dependencies(paddle_pybind legacy_eager_codegen) + add_dependencies(paddle_pybind eager_legacy_op_function_generator_cmd) endif() if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index 7ed58a1e956f6..1bb8fdd936064 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -15,40 +15,39 @@ #include -#include "paddle/fluid/eager/to_static/run_program_op_func.h" #include "paddle/phi/core/enforce.h" namespace paddle { namespace pybind { -static PyObject *eager_api_run_program(PyObject *self, 
- PyObject *args, - PyObject *kwargs) { +static PyObject *eager_api_linear(PyObject *self, + PyObject *args, + PyObject *kwargs) { PyThreadState *tstate = nullptr; try { - auto X = GetTensorListFromArgs("run_program", "X", args, 0, false); - auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); - auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false); - auto OutScope = - GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); - auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); - framework::AttributeMap attrs; - // TODO(zengjinle): support CUDA Graph on eager mode - ConstructAttrMapFromPyArgs( - "run_program", args, 6, PyTuple_GET_SIZE(args), attrs); - + auto x = GetTensorFromArgs("linear", "X", args, 0, false); + auto weight = GetTensorFromArgs("linear", "weight", args, 1, false); + auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); tstate = PyEval_SaveThread(); - run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); - PyEval_RestoreThread(tstate); - tstate = nullptr; - Py_RETURN_NONE; + if (bias.initialized()) { + auto mm_out = matmul_dygraph_function(x, weight, false, false); + auto out = add_dygraph_function(mm_out, bias); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(out); + } else { + auto mm_out = matmul_dygraph_function(x, weight, false, false); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(mm_out); + } } catch (paddle::platform::EnforceNotMet &exception) { if (tstate) { PyEval_RestoreThread(tstate); } std::ostringstream sout; sout << exception.what(); - sout << " [operator < run_program > error]"; + sout << " [operator < linear > error]"; exception.set_error_str(sout.str()); ThrowExceptionToPython(std::current_exception()); return nullptr; @@ -61,9 +60,9 @@ static PyObject *eager_api_run_program(PyObject *self, } } -static PyMethodDef CustomEagerMethods[] = { - {"run_program", - (PyCFunction)(void (*)(void))eager_api_run_program, +static PyMethodDef CustomEagerFinalStateMethods[] = { + {"linear", + (PyCFunction)(void (*)(void))eager_api_linear, METH_VARARGS | METH_KEYWORDS, "C++ interface function for run_program in dygraph."}, {nullptr, nullptr, 0, nullptr}}; diff --git a/paddle/fluid/pybind/eager_final_state_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h similarity index 56% rename from paddle/fluid/pybind/eager_final_state_custom_python_api.h rename to paddle/fluid/pybind/eager_legacy_custom_python_api.h index 4774b33a722d5..7ed58a1e956f6 100644 --- a/paddle/fluid/pybind/eager_final_state_custom_python_api.h +++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h @@ -15,41 +15,40 @@ #include +#include "paddle/fluid/eager/to_static/run_program_op_func.h" #include "paddle/phi/core/enforce.h" namespace paddle { namespace pybind { -static PyObject *eager_api_final_state_linear(PyObject *self, - PyObject *args, - PyObject *kwargs) { +static PyObject *eager_api_run_program(PyObject *self, + PyObject *args, + PyObject *kwargs) { PyThreadState *tstate = nullptr; try { - auto x = GetTensorFromArgs("linear", "X", args, 0, false); - auto weight = GetTensorFromArgs("linear", "weight", args, 1, false); - auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); + auto X = GetTensorListFromArgs("run_program", "X", args, 0, false); + auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); + auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false); + auto OutScope 
= + GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); + auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); + framework::AttributeMap attrs; + // TODO(zengjinle): support CUDA Graph on eager mode + ConstructAttrMapFromPyArgs( + "run_program", args, 6, PyTuple_GET_SIZE(args), attrs); + tstate = PyEval_SaveThread(); - if (bias.initialized()) { - auto mm_out = - matmul_final_state_dygraph_function(x, weight, false, false); - auto out = add_final_state_dygraph_function(mm_out, bias); - PyEval_RestoreThread(tstate); - tstate = nullptr; - return ToPyObject(out); - } else { - auto mm_out = - matmul_final_state_dygraph_function(x, weight, false, false); - PyEval_RestoreThread(tstate); - tstate = nullptr; - return ToPyObject(mm_out); - } + run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); + PyEval_RestoreThread(tstate); + tstate = nullptr; + Py_RETURN_NONE; } catch (paddle::platform::EnforceNotMet &exception) { if (tstate) { PyEval_RestoreThread(tstate); } std::ostringstream sout; sout << exception.what(); - sout << " [operator < linear > error]"; + sout << " [operator < run_program > error]"; exception.set_error_str(sout.str()); ThrowExceptionToPython(std::current_exception()); return nullptr; @@ -62,9 +61,9 @@ static PyObject *eager_api_final_state_linear(PyObject *self, } } -static PyMethodDef CustomEagerFinalStateMethods[] = { - {"final_state_linear", - (PyCFunction)(void (*)(void))eager_api_final_state_linear, +static PyMethodDef CustomEagerMethods[] = { + {"run_program", + (PyCFunction)(void (*)(void))eager_api_run_program, METH_VARARGS | METH_KEYWORDS, "C++ interface function for run_program in dygraph."}, {nullptr, nullptr, 0, nullptr}}; diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_legacy_op_function_generator.cc similarity index 96% rename from paddle/fluid/pybind/eager_op_function_generator.cc rename to paddle/fluid/pybind/eager_legacy_op_function_generator.cc index 72c12b267d1c9..77a3d32d68564 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_legacy_op_function_generator.cc @@ -320,10 +320,12 @@ std::string GenerateOpFunctionsBody( std::string inplace_arg_name = inplace_pair.second; std::string inplace_return_name = inplace_pair.first; const char* RETURN_INPLACE_TENSOR_TEMPLATE = - " ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, " + " ssize_t arg_id = " + "GetIdxFromCoreOpsInfoMap(core_ops_legacy_args_info, " "\"%s\", \"%s\");\n" " ssize_t return_id = " - "GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n" + "GetIdxFromCoreOpsInfoMap(core_ops_legacy_returns_info, \"%s\", " + "\"%s\");\n" " inplace_var_idx_map[return_id] = arg_id;"; return_str += paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE, op_type, @@ -361,7 +363,7 @@ static std::string GenerateCoreOpsInfoMap() { " PyThreadState *tstate = nullptr;\n" " try\n" " {\n" - " return ToPyObject(core_ops_args_info);\n" + " return ToPyObject(core_ops_legacy_args_info);\n" " }\n" " catch(...) {\n" " if (tstate) {\n" @@ -376,7 +378,7 @@ static std::string GenerateCoreOpsInfoMap() { " PyThreadState *tstate = nullptr;\n" " try\n" " {\n" - " return ToPyObject(core_ops_args_type_info);\n" + " return ToPyObject(core_ops_legacy_args_type_info);\n" " }\n" " catch(...) 
{\n" " if (tstate) {\n" @@ -391,7 +393,7 @@ static std::string GenerateCoreOpsInfoMap() { " PyThreadState *tstate = nullptr;\n" " try\n" " {\n" - " return ToPyObject(core_ops_returns_info);\n" + " return ToPyObject(core_ops_legacy_returns_info);\n" " }\n" " catch(...) {\n" " if (tstate) {\n" @@ -429,7 +431,7 @@ GenerateOpFunctions() { !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { continue; } - std::string func_name = "eager_api_" + op_type; + std::string func_name = "eager_legacy_api_" + op_type; std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name, {}); @@ -461,7 +463,7 @@ GenerateOpFunctions() { } std::string inplace_op_type = op_type + "_"; - std::string inplace_func_name = "eager_api_" + inplace_op_type; + std::string inplace_func_name = "eager_legacy_api_" + inplace_op_type; std::string inplace_op_function_str = GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map); @@ -500,7 +502,7 @@ int main(int argc, char* argv[]) { "\"paddle/fluid/platform/profiler/event_tracing.h\"", "\"paddle/fluid/pybind/exception.h\"", "\"paddle/fluid/pybind/op_function_common.h\"", - "\"paddle/fluid/pybind/eager_custom_python_api.h\"", + "\"paddle/fluid/pybind/eager_legacy_custom_python_api.h\"", "\"paddle/fluid/pybind/eager.h\""}; std::ofstream out(argv[1], std::ios::out); @@ -540,11 +542,12 @@ int main(int argc, char* argv[]) { out << "void BindEagerOpFunctions(pybind11::module *module) {\n" << " InitOpsAttrTypeMap();\n" << " auto m = module->def_submodule(\"ops\");\n" - << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" + << " auto legacy = m.def_submodule(\"legacy\");\n" + << " if (PyModule_AddFunctions(legacy.ptr(), ExtestMethods) < 0) {\n" << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " "core.eager.ops failed!\"));\n" << " }\n\n" - << " if (PyModule_AddFunctions(m.ptr(), CustomEagerMethods) < " + << " if (PyModule_AddFunctions(legacy.ptr(), CustomEagerMethods) < " "0) {\n" << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " "core.eager.ops failed!\"));\n" diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 847c2bd8b9d30..0e8bf1d0f8861 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -806,14 +806,14 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, decrease_axis.end()); if (op_type == "slice") { - out = slice_final_state_dygraph_function(self->tensor, - slice_axes_tmp, - slice_starts, - slice_ends, - infer_flags_tmp, - decrease_axis_tmp); + out = slice_dygraph_function(self->tensor, + slice_axes_tmp, + slice_starts, + slice_ends, + infer_flags_tmp, + decrease_axis_tmp); } else if (op_type == "strided_slice") { - out = strided_slice_final_state_dygraph_function( + out = strided_slice_dygraph_function( self->tensor, slice_axes, slice_starts, slice_ends, slice_strides); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -852,7 +852,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, } paddle::experimental::Tensor new_out; - new_out = unsqueeze_final_state_dygraph_function(out, none_axes); + new_out = unsqueeze_dygraph_function(out, none_axes); return ToPyObject(new_out); } } @@ -868,8 +868,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, paddle::framework::TensorFromVector( list_select_idxs, *dev_ctx, idx_tensor.get()); framework::AttributeMap attrs = {{"dim", 0}}; - out = index_select_final_state_dygraph_function( - self->tensor, select_index, 0); 
+ out = index_select_dygraph_function(self->tensor, select_index, 0); } return ToPyObject(out); diff --git a/paddle/fluid/pybind/generate_file_structures.py b/paddle/fluid/pybind/generate_file_structures.py index bc61ecdcc96f5..869b27050a08c 100644 --- a/paddle/fluid/pybind/generate_file_structures.py +++ b/paddle/fluid/pybind/generate_file_structures.py @@ -20,7 +20,7 @@ pybind_dir = sys.argv[1] split_count = int(sys.argv[2]) - empty_files = [os.path.join(pybind_dir, "eager_final_state_op_function.cc")] + empty_files = [os.path.join(pybind_dir, "eager_legacy_op_function.cc")] empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc")) for i in range(split_count): diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 884136ec0d37b..542860fa0dc78 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -126,45 +126,6 @@ CastPyHandleToVarBaseList(const std::string& op_type, return result; } // namespace pybind -static inline void ConstructAttrMapFromPyArgs(const std::string& op_type, - int start_idx, - framework::AttributeMap* attrs, - const py::args& args) { - PADDLE_ENFORCE_EQ( - args.size() % 2, - 0, - platform::errors::InvalidArgument( - "The number of arguments for arributes should be even.")); - for (size_t i = 0; i < args.size(); i += 2) { - std::string name; - framework::Attribute value; - try { - name = args[i].cast(); - } catch (std::exception& e) { - PyObject* py_obj = args[i].ptr(); // get underlying PyObject - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be str, but got " - "%s", - op_type, - start_idx + i, - Py_TYPE(py_obj)->tp_name)); - } - try { - value = args[i + 1].cast(); - } catch (std::exception& e) { - PyObject* py_obj = args[i + 1].ptr(); // get underlying PyObject - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "Attribute type (one of str, bool, int, int64, float, or list of " - "them), but got %s", - op_type, - start_idx + i + 1, - Py_TYPE(py_obj)->tp_name)); - } - (*attrs)[name] = value; - } -} - static inline std::vector> ConstructDuplicableOutput(const size_t num) { auto tracer = imperative::GetCurrentTracer(); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 28bdbf92d1827..e7970f69e5775 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -188,6 +188,14 @@ float CastPyArg2Float(PyObject* obj, return static_cast(CastPyArg2Double(obj, op_type, arg_pos)); } +void CastPyArg2AttrFloat(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, + const std::string& op_type, + ssize_t arg_pos) { + attrs[key] = CastPyArg2Float(obj, op_type, arg_pos); +} + double CastPyArg2Double(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { @@ -196,7 +204,7 @@ double CastPyArg2Double(PyObject* obj, } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "float, but got %s", + "double, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT @@ -205,12 +213,12 @@ double CastPyArg2Double(PyObject* obj, return 0.0; } -void CastPyArg2AttrFloat(PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, - const std::string& op_type, - ssize_t arg_pos) { - attrs[key] = CastPyArg2Float(obj, op_type, arg_pos); +void CastPyArg2AttrDouble(PyObject* obj, + 
paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, + const std::string& op_type, + ssize_t arg_pos) { + attrs[key] = CastPyArg2Double(obj, op_type, arg_pos); } std::string CastPyArg2String(PyObject* obj, @@ -735,6 +743,9 @@ void ConstructAttrMapFromPyArgs( case paddle::framework::proto::AttrType::FLOAT: CastPyArg2AttrFloat(obj, attrs, key, op_type, arg_pos); break; + case paddle::framework::proto::AttrType::FLOAT64: + CastPyArg2AttrDouble(obj, attrs, key, op_type, arg_pos); + break; case paddle::framework::proto::AttrType::STRING: CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); break; diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 7bbfb8b85b82f..efa16494e7733 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -107,6 +107,12 @@ void CastPyArg2AttrFloat(PyObject* obj, const std::string& op_type, ssize_t arg_pos); +void CastPyArg2AttrDouble(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, + const std::string& op_type, + ssize_t arg_pos); + void CastPyArg2AttrString(PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT const std::string& key, diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index 4eb580955a97c..7d4efead49431 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -44,7 +44,8 @@ using Attribute = paddle::variant, std::vector, VarDesc*, - std::vector>; + std::vector, + double>; using AttributeMap = std::unordered_map; } // namespace framework namespace imperative { diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index de2dcd6909f4f..729c56352cf89 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -192,6 +192,7 @@ struct InferMetaFnImpl { PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int64_t); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(float); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(double); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataType); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 76142c4eea1af..9925a10e6dcb3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3649,11 +3649,13 @@ void TraceInferMeta( } void TransferLayoutInferMeta(const MetaTensor& x, - DataLayout layout, + int src_layout, + int dst_layout, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(x.dtype()); - out->set_layout(layout); + out->set_layout(static_cast(dst_layout)); + out->share_lod(x); } void TransposeInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 736360e740065..c0e20714a7ba4 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -519,7 +519,8 @@ void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); void TransferLayoutInferMeta(const MetaTensor& x, - DataLayout layout, + int src_layout, + int dst_layout, MetaTensor* out); void TransposeInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index d40b6f589c5f1..47ae390fb6f6c 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -66,7 +66,8 @@ 
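The new FLOAT64 branch above completes a path that previously narrowed silently: a Python float bound to a double attribute now stays a double from ConstructAttrMapFromPyArgs through the paddle::variant and into InferMeta. A self-contained sketch of the dispatch shape (simplified stand-in types, not the framework's):

    #include <string>
    #include <unordered_map>
    #include <variant>

    // Simplified stand-ins for framework::Attribute / AttributeMap.
    using Attribute = std::variant<int, float, double, std::string>;
    using AttributeMap = std::unordered_map<std::string, Attribute>;

    enum class AttrType { FLOAT, FLOAT64 };

    // Mirrors the switch extended in op_function_common.cc: FLOAT narrows to
    // float as before, while the new FLOAT64 branch stores the full double.
    void SetFloatingAttr(AttrType type, double raw, const std::string& key,
                         AttributeMap* attrs) {
      switch (type) {
        case AttrType::FLOAT:
          (*attrs)[key] = static_cast<float>(raw);
          break;
        case AttrType::FLOAT64:
          (*attrs)[key] = raw;  // no narrowing
          break;
      }
    }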
set(COMMON_KERNEL_DEPS phi_dynload_warpctc sequence_padding sequence_scale - fft) + fft + phi_data_layout_transform) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} diff --git a/paddle/phi/kernels/cpu/embedding_kernel.cc b/paddle/phi/kernels/cpu/embedding_kernel.cc index 0430f7a005221..7accdffe1340d 100644 --- a/paddle/phi/kernels/cpu/embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_kernel.cc @@ -48,6 +48,10 @@ struct EmbeddingCPUFunctor { dev_ctx_.template Alloc(out_); auto* output = out_->data(); +#if defined(_OPENMP) && !defined(PADDLE_WITH_CUDA) +#pragma omp parallel for +#endif + for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx_ != kNoPadding && ids[i] == padding_idx_) { memset(output + i * row_width, 0, row_width * sizeof(T)); diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index e21bea2e24269..122e4ba7feae4 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -17,6 +17,11 @@ math_library(segment_pooling) math_library(sequence2batch) math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) +cc_library( + phi_data_layout_transform + SRCS data_layout_transform.cc + DEPS tensor) + if(WITH_GPU OR WITH_ROCM) if(MKL_FOUND AND WITH_ONEMKL) math_library(fft spectral_op.cu DEPS dynload_cuda dynload_mklrt diff --git a/paddle/phi/kernels/funcs/data_layout_transform.cc b/paddle/phi/kernels/funcs/data_layout_transform.cc new file mode 100644 index 0000000000000..800d67583e087 --- /dev/null +++ b/paddle/phi/kernels/funcs/data_layout_transform.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
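A note on the embedding change above before the new file's body begins: the lookup loop is embarrassingly parallel because each output row depends only on ids[i], which is what makes the added `#pragma omp parallel for` safe. A self-contained sketch of the pattern (simplified relative to EmbeddingCPUFunctor: float only, padding always honored):

    #include <cstdint>
    #include <cstring>

    // Each iteration writes a disjoint output row, so rows can be filled in
    // parallel. The guard matches the patch: OpenMP builds only, not CUDA.
    void EmbeddingLookup(const float* table, const int64_t* ids,
                         int64_t ids_numel, int64_t row_width,
                         int64_t padding_idx, float* output) {
    #if defined(_OPENMP) && !defined(PADDLE_WITH_CUDA)
    #pragma omp parallel for
    #endif
      for (int64_t i = 0; i < ids_numel; ++i) {
        if (ids[i] == padding_idx) {
          std::memset(output + i * row_width, 0, row_width * sizeof(float));
        } else {
          std::memcpy(output + i * row_width, table + ids[i] * row_width,
                      row_width * sizeof(float));
        }
      }
    }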
+
+#include "paddle/phi/kernels/funcs/data_layout_transform.h"
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/onednn/onednn_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/phi/kernels/funcs/onednn/mkldnn_helper.h"
+#include "paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h"
+#endif
+
+namespace phi {
+namespace funcs {
+
+#ifdef PADDLE_WITH_MKLDNN
+
+void* GetDataFromTensor(const DenseTensor& tensor,
+                        dnnl::memory::data_type type) {
+  switch (type) {
+    case dnnl::memory::data_type::f32:
+      return to_void_cast(tensor.data<float>());
+    case dnnl::memory::data_type::s8:
+      return to_void_cast(tensor.data<int8_t>());
+    case dnnl::memory::data_type::u8:
+      return to_void_cast(tensor.data<uint8_t>());
+    case dnnl::memory::data_type::s32:
+      return to_void_cast(tensor.data<int32_t>());
+    case dnnl::memory::data_type::bf16:
+      return to_void_cast(tensor.data<dtype::bfloat16>());
+    default:
+      PADDLE_THROW(errors::InvalidArgument("Wrong mkldnn type provided."));
+  }
+}
+
+void innerTransDataLayoutFromMKLDNN(DataLayout in_layout,
+                                    DataLayout out_layout,
+                                    const DenseTensor& in,
+                                    DenseTensor* out,
+                                    Place place,
+                                    bool always_copy) {
+  // Set default as NCHW in case not specified
+  out_layout = out_layout == DataLayout::ANY ? DataLayout::NCHW : out_layout;
+
+  auto& pool = DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<OneDNNContext*>(pool.Get(place));
+  auto& cpu_engine = dev_ctx->GetEngine();
+
+  auto in_tz = vectorize(in.dims());
+  auto out_tz = in_tz;
+
+  auto in_type = ToMKLDNNDataType(in.dtype());
+  PADDLE_ENFORCE_NE(
+      in_type,
+      MKLDNNDataType::undef,
+      errors::InvalidArgument("Input tensor type (%s) is not supported.",
+                              in.dtype()));
+
+  auto out_format =
+      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
+  dnnl::memory::desc out_mem_desc(out_tz, in_type, out_format);
+
+  // The output tensor has the same dims as the input; a reorder doesn't
+  // change dims.
+  out->set_mem_desc(out_mem_desc);
+  out->Resize(in.dims());
+
+  if ((in.mem_desc() != out->mem_desc()) || always_copy) {
+    void* in_data = GetDataFromTensor(in, in_type);
+
+    ReorderMKLDNNHandler handler(in_tz, in.dtype(), in_type, cpu_engine);
+
+    auto reorder_src_memory_p =
+        handler.AcquireSrcMemory(in.mem_desc(), in_data);
+    auto reorder_dst_memory_p =
+        handler.AcquireDstMemory(out, out->mem_desc(), place);
+    auto reorder_p =
+        handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+    auto& astream = OneDNNContext::tls().get_stream();
+    ::paddle::platform::RecordEvent record_reorder(
+        "ext_reorder",
+        ::paddle::platform::TracerEventType::UserDefined,
+        2,
+        ::paddle::platform::EventRole::kUniqueOp);
+    reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+    astream.wait();
+  } else {
+    out->ShareDataWith(in);
+  }
+  // For the expected NHWC data format we need to reshape the output tensor,
+  // as the MKL-DNN description was in NCHW while paddle expects NHWC.
+  MatchShapeToLayout(out, in_layout, out_layout);
+
+  out->set_layout(DataLayout::kNCHW);
+  VLOG(10) << "out->layout: " << out->layout() << " in->dims: " << in.dims()
+           << " out->dims: " << out->dims();
+  // Reset format since the out tensor will be fed to a non-MKLDNN OPKernel.
+  out->set_format(MKLDNNMemoryFormat::undef);
+}
+
+#endif
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/data_layout_transform.h b/paddle/phi/kernels/funcs/data_layout_transform.h
new file mode 100644
index 0000000000000..8fff3195b5cb4
--- /dev/null
+++ b/paddle/phi/kernels/funcs/data_layout_transform.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
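A usage note for the function defined above (the helper name is mine; it assumes a PADDLE_WITH_MKLDNN build and that `src` was produced by a oneDNN kernel, i.e. carries a valid mem_desc):

    // Convert a oneDNN-described tensor back to plain NHWC before handing it
    // to a non-oneDNN kernel (hypothetical helper).
    void ToPlainNHWC(const phi::DenseTensor& src, phi::DenseTensor* dst) {
      phi::funcs::innerTransDataLayoutFromMKLDNN(phi::DataLayout::MKLDNN,
                                                 phi::DataLayout::NHWC,
                                                 src,
                                                 dst,
                                                 phi::CPUPlace(),
                                                 /*always_copy=*/false);
    }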
+ +#pragma once + +#ifdef PADDLE_WITH_MKLDNN +#include "dnnl.hpp" // NOLINT +#endif + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +namespace funcs { + +#ifdef PADDLE_WITH_MKLDNN + +using MKLDNNDataType = dnnl::memory::data_type; +using MKLDNNMemoryFormat = dnnl::memory::format_tag; + +inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) { + switch (layout) { + case DataLayout::NHWC: + return MKLDNNMemoryFormat::nhwc; + case DataLayout::NCHW: + return MKLDNNMemoryFormat::nchw; + case DataLayout::NCDHW: + return MKLDNNMemoryFormat::ncdhw; + case DataLayout::NDHWC: + return MKLDNNMemoryFormat::ndhwc; + default: + PADDLE_THROW(errors::InvalidArgument( + "Fail to convert layout %s to MKLDNN format.", + ::paddle::framework::DataLayoutToString(layout))); + } +} + +// Caution: proto::VarType::Type -> phi::DataType after transfer +inline MKLDNNDataType ToMKLDNNDataType(DataType type) { + static std::unordered_map dict{ + {DataType::FLOAT32, MKLDNNDataType::f32}, + {DataType::INT8, MKLDNNDataType::s8}, + {DataType::UINT8, MKLDNNDataType::u8}, + {DataType::INT32, MKLDNNDataType::s32}, + {DataType::BFLOAT16, MKLDNNDataType::bf16}}; + auto iter = dict.find(type); + if (iter != dict.end()) return iter->second; + return MKLDNNDataType::undef; +} + +void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, + DataLayout out_layout, + const DenseTensor& in, + DenseTensor* out, + Place place, + bool always_copy = false); +void* GetDataFromTensor(const DenseTensor& tensor, MKLDNNDataType type); + +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/onednn/mkldnn_helper.h b/paddle/phi/kernels/funcs/onednn/mkldnn_helper.h new file mode 100644 index 0000000000000..9a0aa8194c4e7 --- /dev/null +++ b/paddle/phi/kernels/funcs/onednn/mkldnn_helper.h @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
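A quick check on the dtype mapping declared in the header above; a hedged sketch (assumes a PADDLE_WITH_MKLDNN build) of the contract the .cc relies on: unsupported dtypes come back as undef and are then rejected with InvalidArgument, rather than throwing here:

    #include <cassert>

    #include "paddle/phi/kernels/funcs/data_layout_transform.h"

    int main() {
      using phi::funcs::MKLDNNDataType;
      using phi::funcs::ToMKLDNNDataType;
      assert(ToMKLDNNDataType(phi::DataType::FLOAT32) == MKLDNNDataType::f32);
      // FLOAT64 has no entry in the mapping, so it falls through to undef.
      assert(ToMKLDNNDataType(phi::DataType::FLOAT64) == MKLDNNDataType::undef);
      return 0;
    }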
+ +#pragma once + +#include "dnnl.hpp" // NOLINT +#include "glog/logging.h" + +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +namespace funcs { + +using MKLDNNMemoryFormat = dnnl::memory::format_tag; +using MKLDNNDataType = dnnl::memory::data_type; + +template +void* to_void_cast(const Type* t) { + return static_cast(const_cast(t)); +} + +inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size, + MKLDNNMemoryFormat data_format) { + if (dims_size == 1) { + return MKLDNNMemoryFormat::x; + } else if (dims_size == 2) { + return MKLDNNMemoryFormat::nc; + } else if (dims_size == 3) { + if (data_format == MKLDNNMemoryFormat::nchw) { + return MKLDNNMemoryFormat::ncw; + } else if (data_format == MKLDNNMemoryFormat::nhwc) { + return MKLDNNMemoryFormat::nwc; + } + } else if (dims_size == 4) { + if (data_format == MKLDNNMemoryFormat::goihw) { + return MKLDNNMemoryFormat::oihw; + } + } else if (dims_size == 5) { + if (data_format == MKLDNNMemoryFormat::goidhw) { + return MKLDNNMemoryFormat::oidhw; + } + if (data_format == MKLDNNMemoryFormat::nchw) { + return MKLDNNMemoryFormat::ncdhw; + } else if (data_format == MKLDNNMemoryFormat::nhwc) { + return MKLDNNMemoryFormat::ndhwc; + } + } else if (dims_size == 6) { + if (data_format == MKLDNNMemoryFormat::nchw) { + return MKLDNNMemoryFormat::abcdef; + } + } + return data_format; +} + +inline void MatchShapeToLayout(DenseTensor* tensor_in, + DataLayout from, + DataLayout to) { + auto print_dims = [](const std::vector& dims) { + std::ostringstream oss; + + if (!dims.empty()) { + oss << "["; + // Convert all but the last element to avoid a trailing "," + std::copy( + dims.begin(), dims.end() - 1, std::ostream_iterator(oss, ",")); + + // Now add the last element with no delimiter + oss << dims.back() << "]"; + } + + return oss.str(); + }; + + // In these data layouts, channel dimension is either on 2nd position: nChw or + // at last nhwC, so for dim==2 these layouts are the same and nothing should + // be done. Similarly for dim==1 when you have just one possible combination. 
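+  // Editorial example (hypothetical shapes): a logically-NHWC tensor of
+  // shape [2, 4, 5, 3] is described to oneDNN in NCHW order as
+  // [2, 3, 4, 5]; the rotations below convert between those two views.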
+ if (tensor_in->dims().size() < 3) { + VLOG(3) << "Keeping MKLDNN/NHWC/NDHWC output_shape" + << print_dims(phi::vectorize(tensor_in->dims())); + return; + } + + switch (from) { + case DataLayout::MKLDNN: + if ((to == DataLayout::NHWC) || (to == DataLayout::NDHWC)) { + auto dims = phi::vectorize(tensor_in->dims()); + std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); + tensor_in->Resize(phi::make_ddim(dims)); + VLOG(3) << "Rotating Shape from: MKLDNN to: NHWC/NDHWC output_shape" + << print_dims(dims); + } + break; + case DataLayout::NHWC: + case DataLayout::NDHWC: + if (to == DataLayout::MKLDNN) { + auto dims = phi::vectorize(tensor_in->dims()); + std::rotate(dims.begin() + 1, dims.end() - 1, dims.end()); + tensor_in->Resize(phi::make_ddim(dims)); + VLOG(3) << "Rotating Shape from: NHWC/NDHWC to: MKLDNN output_shape" + << print_dims(dims); + } + break; + default: + break; + } +} + +struct mkldnn_dummy_primitive { + struct primitive_desc {}; + struct desc {}; +}; + +inline dnnl::memory::desc MKLDNNMemDesc(const std::vector& dims, + dnnl::memory::data_type data_type, + MKLDNNMemoryFormat format) { + return dnnl::memory::desc({dims}, data_type, format); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h b/paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h index 96333132508c4..56f2da3b3bd0c 100644 --- a/paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h +++ b/paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h @@ -20,11 +20,12 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/onednn/mkldnn_helper.h" namespace phi { namespace funcs { @@ -33,10 +34,12 @@ using user_function = std::function(const float*)>; using memory = dnnl::memory; using Place = phi::Place; +using MKLDNNMemoryFormat = dnnl::memory::format_tag; + template + typename TBackward = mkldnn_dummy_primitive, + typename TBackward_params = mkldnn_dummy_primitive> class MKLDNNHandlerNoCachingT { public: MKLDNNHandlerNoCachingT(dnnl::engine engine, Place cpu_place) @@ -62,8 +65,8 @@ class MKLDNNHandlerNoCachingT { std::shared_ptr AcquireSrcMemory(const DenseTensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - fwd_pd_->src_desc(), paddle::platform::to_void_cast(input_data)); + return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), + to_void_cast(input_data)); } template @@ -81,16 +84,15 @@ class MKLDNNHandlerNoCachingT { template std::shared_ptr AcquireDstMemory(const DenseTensor* output) { const T_out* output_data = output->data(); - return this->AcquireMemoryFromPrimitive( - bwd_pd_->dst_desc(), - paddle::platform::to_void_cast(output_data)); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data)); } std::shared_ptr AcquireDiffDstMemory( const DenseTensor* diffdst) { const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive( - bwd_pd_->diff_dst_desc(), paddle::platform::to_void_cast(ptr)); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), + to_void_cast(ptr)); } std::shared_ptr AcquireDiffSrcMemory(DenseTensor* diffsrc) { @@ -291,10 +293,110 @@ class ActivationMKLDNNHandler 
std::shared_ptr AcquireBackwardSrcMemory( const DenseTensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->bwd_pd_->src_desc(), - paddle::platform::to_void_cast(input_data)); + return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), + to_void_cast(input_data)); + } +}; + +class ReorderMKLDNNHandler { + public: + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + DataType ptype, + dnnl::memory::data_type dtype, + dnnl::engine engine) + : dims_(dims), + ptype_(ptype), + ptype_dst_(ptype), + dtype_(dtype), + dtype_dst_(dtype), + engine_(engine) {} + + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + DataType ptype, + dnnl::memory::data_type dtype, + DataType ptype_dst, + dnnl::memory::data_type dtype_dst, + dnnl::engine engine) + : dims_(dims), + ptype_(ptype), + ptype_dst_(ptype_dst), + dtype_(dtype), + dtype_dst_(dtype_dst), + engine_(engine) {} + + std::shared_ptr AcquireSrcMemory(const dnnl::memory::desc& md, + void* ptr) { + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireSrcMemory(const MKLDNNMemoryFormat& fmt, + void* ptr) { + auto md = dnnl::memory::desc(dims_, dtype_, fmt); + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireSubmemory( + const std::vector& dims, + const std::vector& offset, + const std::shared_ptr& mem_p) { + auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset}); + auto sub_mem_p = std::make_shared( + sub_md, engine_, mem_p->get_data_handle()); + return sub_mem_p; + } + + std::shared_ptr AcquireDstMemory(DenseTensor* output, + const MKLDNNMemoryFormat& fmt, + Place place) { + auto dst_md = MKLDNNMemDesc(dims_, dtype_dst_, fmt); + auto dst_data = output->mutable_data(place, ptype_dst_, dst_md.get_size()); + return std::make_shared(dst_md, engine_, dst_data); } + + std::shared_ptr AcquireDstMemory( + DenseTensor* output, const dnnl::memory::desc& src_md, Place place) { + if (ptype_dst_ == ptype_) { + auto dst_data = + output->mutable_data(place, ptype_dst_, src_md.get_size()); + return std::make_shared(src_md, engine_, dst_data); + } else { + auto dst_md = src_md; + dst_md.data.data_type = static_cast(dtype_dst_); + auto dst_data = + output->mutable_data(place, ptype_dst_, dst_md.get_size()); + return std::make_shared(dst_md, engine_, dst_data); + } + } + + std::shared_ptr AcquireDstMemory( + DenseTensor* output, + const std::vector& dims, + const MKLDNNMemoryFormat& fmt, + Place place) { + auto dst_md = MKLDNNMemDesc(dims, dtype_dst_, fmt); + auto dst_data = output->mutable_data(place, ptype_dst_, dst_md.get_size()); + return std::make_shared(dst_md, engine_, dst_data); + } + + std::shared_ptr AcquireReorder( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p) { + return std::make_shared(*(src_memory_p), *(dst_memory_p)); + } + + std::shared_ptr AcquireReorder( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p, + const dnnl::primitive_attr& attrs) { + return std::make_shared( + *(src_memory_p), *(dst_memory_p), attrs); + } + + private: + std::vector dims_; + DataType ptype_, ptype_dst_; + dnnl::memory::data_type dtype_, dtype_dst_; + dnnl::engine engine_; }; } // namespace funcs diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index f7ecf379fdfa9..2110a06f16172 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -14,11 +14,17 @@ limitations under the License. 
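A hedged usage sketch of the ReorderMKLDNNHandler defined above (the helper name is mine; engine, stream, source descriptor, and buffers are parameters here so the sketch stands alone, whereas the real call sites obtain them from OneDNNContext):

    // Reorder an f32 source buffer described by `src_md` into `out` as NHWC.
    void ReorderToNHWC(std::vector<int64_t>& dims,  // NOLINT
                       const dnnl::memory::desc& src_md,
                       void* src_ptr,
                       phi::DenseTensor* out,
                       dnnl::engine engine,
                       dnnl::stream astream) {
      phi::funcs::ReorderMKLDNNHandler handler(
          dims, phi::DataType::FLOAT32, dnnl::memory::data_type::f32, engine);
      auto src_mem = handler.AcquireSrcMemory(src_md, src_ptr);
      auto dst_mem = handler.AcquireDstMemory(
          out, dnnl::memory::format_tag::nhwc, phi::CPUPlace());
      auto reorder = handler.AcquireReorder(dst_mem, src_mem);
      reorder->execute(astream, *src_mem, *dst_mem);
      astream.wait();
    }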
*/ #include "paddle/phi/kernels/transfer_layout_kernel.h" +#include +#include + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" #include "paddle/phi/kernels/funcs/math_function.h" - +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/phi/kernels/funcs/onednn/mkldnn_helper.h" +#endif namespace phi { std::vector GetAxis(const DataLayout& from, const DataLayout& to) { @@ -46,10 +52,10 @@ void CastDataLayout(const Context& dev_ctx, } template -void TransferLayoutKernel(const Context& dev_ctx, - const DenseTensor& x, - DataLayout dst_layout, - DenseTensor* out) { +void TransferLayoutGeneral(const Context& dev_ctx, + const DenseTensor& x, + DataLayout dst_layout, + DenseTensor* out) { auto src_dim = x.dims(); auto axis = GetAxis(x.layout(), dst_layout); @@ -60,16 +66,110 @@ void TransferLayoutKernel(const Context& dev_ctx, dst_dim[i] = src_dim[axis[i]]; } - out->ResizeAndAllocate(phi::make_ddim(dst_dim)); + out->Resize(phi::make_ddim(dst_dim)); + dev_ctx.Alloc(out, x.dtype()); PD_VISIT_ALL_TYPES(x.dtype(), "CastDataLayout", ([&] { CastDataLayout(dev_ctx, x, axis, out); })); } +#ifdef PADDLE_WITH_MKLDNN +template +void TransferLayoutMKLDNN(const Context& dev_ctx, + const DenseTensor& x, + DataLayout src_layout, + DataLayout dst_layout, + DenseTensor* out) { + auto print_tensor_meta = [](const DenseTensor& x) { + std::ostringstream oss; + + oss << "["; + oss << "layout:" << x.layout() << " ,"; + oss << "dims:" << x.dims() << " ,"; + if (x.IsInitialized()) oss << "place:" << x.place(); + oss << "]"; + + return oss.str(); + }; + VLOG(10) << " x: " << print_tensor_meta(x); + VLOG(10) << " out: " << print_tensor_meta(*out) << " " << out; + + // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in + // data_transfer.cc + if (!x.IsInitialized() && src_layout == DataLayout::MKLDNN && + dst_layout == DataLayout::NHWC) { + VLOG(4) << src_layout << "->" << dst_layout << " " << x.layout(); + out->Resize(x.dims()); + out->set_layout(dst_layout); + funcs::MatchShapeToLayout(out, src_layout, dst_layout); + return; + } + + if (src_layout != DataLayout::MKLDNN && dst_layout == DataLayout::MKLDNN) { + // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel + // Just set layout/format. 
No real transform occurs
+    auto out_format = funcs::MKLDNNFormatForSize(
+        x.dims().size(), funcs::ToMKLDNNFormat(src_layout));
+
+    out->ShareDataWith(x);
+    // For NHWC data we need to reshape the tensors, as MKL-DNN
+    // expects the dims description in NHWC order
+    if (src_layout == DataLayout::NHWC) {
+      VLOG(4) << "NHWC";
+      funcs::MatchShapeToLayout(out, src_layout, dst_layout);
+      OneDNNContext::tls().set_cur_paddle_data_layout(src_layout);
+    }
+
+    out->set_layout(DataLayout::MKLDNN);
+    out->set_format(out_format);
+  } else if (src_layout == DataLayout::MKLDNN &&
+             dst_layout != DataLayout::MKLDNN) {
+    // Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
+    // Do the transform via the MKLDNN lib
+    funcs::innerTransDataLayoutFromMKLDNN(
+        src_layout, dst_layout, x, out, dev_ctx.GetPlace());
+  } else if (src_layout == DataLayout::MKLDNN &&
+             dst_layout == DataLayout::MKLDNN) {
+    PADDLE_ENFORCE_NE(
+        src_layout,
+        dst_layout,
+        errors::PreconditionNotMet(
+            "No layout transform needed between two MKLDNN OPKernels."));
+  } else {
+    TransferLayoutGeneral<Context>(dev_ctx, x, dst_layout, out);
+  }
+}
+#endif
+
+template <typename Context>
+void TransferLayoutKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          int src_layout,
+                          int dst_layout,
+                          DenseTensor* out) {
+  PADDLE_ENFORCE_NE(src_layout,
+                    dst_layout,
+                    errors::PreconditionNotMet(
+                        "No layout transform needed between same layout."));
+  VLOG(10) << "TransDataLayout from " << static_cast<DataLayout>(src_layout)
+           << " -> " << static_cast<DataLayout>(dst_layout);
+
+#ifdef PADDLE_WITH_MKLDNN
+  TransferLayoutMKLDNN<Context>(dev_ctx,
+                                x,
+                                static_cast<DataLayout>(src_layout),
+                                static_cast<DataLayout>(dst_layout),
+                                out);
+#else
+  TransferLayoutGeneral<Context>(
+      dev_ctx, x, static_cast<DataLayout>(dst_layout), out);
+#endif
+}
+
 } // namespace phi
 
-PD_REGISTER_GENERAL_KERNEL(phi_transfer_layout,
+PD_REGISTER_GENERAL_KERNEL(transfer_layout,
                            CPU,
                            ALL_LAYOUT,
                            phi::TransferLayoutKernel,
diff --git a/paddle/phi/kernels/transfer_layout_kernel.h b/paddle/phi/kernels/transfer_layout_kernel.h
index 3777daf07de71..73e12927d7ffe 100644
--- a/paddle/phi/kernels/transfer_layout_kernel.h
+++ b/paddle/phi/kernels/transfer_layout_kernel.h
@@ -23,7 +23,8 @@ namespace phi {
 template <typename Context>
 void TransferLayoutKernel(const Context& dev_ctx,
                           const DenseTensor& x,
-                          DataLayout dst_layout,
+                          int src_layout,
+                          int dst_layout,
                           DenseTensor* out);
 
 template <typename Context>
@@ -32,7 +33,11 @@ DenseTensor TransferLayout(const Context& dev_ctx,
                            DataLayout dst_layout) {
   phi::DenseTensor dense_out =
       phi::Empty(dev_ctx, {x.dtype(), x.dims(), dst_layout});
-  TransferLayoutKernel<Context>(dev_ctx, x, dst_layout, &dense_out);
+  TransferLayoutKernel<Context>(dev_ctx,
+                                x,
+                                static_cast<int>(x.layout()),
+                                static_cast<int>(dst_layout),
+                                &dense_out);
   return dense_out;
 }
diff --git a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc
new file mode 100644
index 0000000000000..49061069b8cba
--- /dev/null
+++ b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+
+namespace phi {
+template <typename T, typename Context>
+void Conv2dTransposeGradKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& filter,
+                               const DenseTensor& dout,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& output_padding,
+                               const std::vector<int>& output_size,
+                               const std::string& padding_algorithm,
+                               int groups,
+                               const std::vector<int>& dilations,
+                               const std::string& data_format,
+                               DenseTensor* dx,
+                               DenseTensor* dfilter) {
+  // The filter and dfilter will be reshaped in the calculations, so use an
+  // assignment here to avoid modifying the variable in the Scope.
+  DenseTensor filter_ = filter;
+  if (!dx && !dfilter) return;
+
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+
+  PADDLE_ENFORCE_EQ(
+      data_format == "NHWC" || data_format == "NDHWC",
+      false,
+      errors::InvalidArgument(
+          ("XPU only supports data_format NCHW in conv grad op.")));
+
+  DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size());
+  DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
+
+  const int batch_size = static_cast<int>(x.dims()[0]);
+  const int img_yc = static_cast<int>(x.dims()[1]);
+  const int img_yh = static_cast<int>(x.dims()[2]);
+  const int img_yw = static_cast<int>(x.dims()[3]);
+  const int img_xc = static_cast<int>(dout.dims()[1]);
+  const int img_xh = static_cast<int>(dout.dims()[2]);
+  const int img_xw = static_cast<int>(dout.dims()[3]);
+  if (dx) {
+    ctx.template Alloc<T>(dx);
+  }
+  if (dfilter) {
+    ctx.template Alloc<T>(dfilter);
+  }
+
+  int r = xpu::conv2d_transpose_grad(
+      ctx.x_context(),
+      x.data<T>(),
+      filter_.data<T>(),
+      dout.data<T>(),
+      dx ? dx->data<T>() : nullptr,
+      dfilter ? dfilter->data<T>() : nullptr,
+      batch_size,
+      img_yc,
+      img_yh,
+      img_yw,
+      img_xc,
+      img_xh,
+      img_xw,
+      ksize,
+      strides,
+      paddings_,
+      dilations_,
+      groups,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr,
+      true);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_grad");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d_transpose_grad,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeGradKernel,
+                   float) {}
diff --git a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc
new file mode 100644
index 0000000000000..3fcd4b4a32645
--- /dev/null
+++ b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
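Before the forward kernel below: the shape check it performs is easiest to see with numbers. For a transposed convolution, out = (in - 1) * stride - pad_lo - pad_hi + dilation * (k - 1) + 1. A self-contained worked example (numbers are hypothetical):

    #include <cassert>

    // Output-size relation checked by the XPU conv2d_transpose kernel below.
    int ConvTransposeOutSize(int in, int stride, int pad_lo, int pad_hi,
                             int k, int dilation) {
      return (in - 1) * stride - pad_lo - pad_hi + (dilation * (k - 1) + 1);
    }

    int main() {
      // in=4, stride=2, pads=1/1, k=3, dilation=1: (4-1)*2 - 2 + 3 = 7.
      assert(ConvTransposeOutSize(4, 2, 1, 1, 3, 1) == 7);
      return 0;
    }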
+
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+
+namespace phi {
+
+// target_len == 2 || target_len == 4
+inline std::vector<int> vector_extend(const std::vector<int>& src,
+                                      int target_len) {
+  if (target_len == 2 && src.size() == 1) {
+    return {src[0], src[0]};
+  }
+  if (target_len == 4 && src.size() == 1) {
+    return {src[0], src[0], src[0], src[0]};
+  }
+  if (target_len == 4 && src.size() == 2) {
+    return {src[0], src[0], src[1], src[1]};
+  }
+  return src;
+}
+
+template <typename T, typename Context>
+void Conv2dTransposeKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& filter,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::vector<int>& output_padding,
+                           const std::vector<int>& output_size,
+                           const std::string& padding_algorithm,
+                           int groups,
+                           const std::vector<int>& dilations,
+                           const std::string& data_format,
+                           DenseTensor* out) {
+  // The filter will be reshaped in the calculations, so use an assignment
+  // here to avoid modifying the variable in the Scope.
+  DenseTensor filter_ = filter;
+
+  ctx.template Alloc<T>(out);
+
+  PADDLE_ENFORCE_EQ(
+      data_format == "NHWC" || data_format == "NDHWC",
+      false,
+      errors::InvalidArgument(
+          ("XPU only supports data_format NCHW in conv_transpose op.")));
+
+  DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size());
+  DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
+
+  const int batch_size = static_cast<int>(x.dims()[0]);
+  const int img_yc = static_cast<int>(x.dims()[1]);
+  const int img_yh = static_cast<int>(x.dims()[2]);
+  const int img_yw = static_cast<int>(x.dims()[3]);
+  const int img_xc = static_cast<int>(out->dims()[1]);
+  const int img_xh = static_cast<int>(out->dims()[2]);
+  const int img_xw = static_cast<int>(out->dims()[3]);
+
+  {
+    std::vector<int> ksize_check = vector_extend(ksize, 2);
+    std::vector<int> stride_check = vector_extend(strides, 2);
+    std::vector<int> pad_check = vector_extend(paddings_, 4);
+    std::vector<int> dilation_check = vector_extend(dilations_, 2);
+
+    int xh_check = (img_yh - 1) * stride_check[0] - pad_check[0] -
+                   pad_check[1] +
+                   (dilation_check[0] * (ksize_check[0] - 1) + 1);
+    int xw_check = (img_yw - 1) * stride_check[1] - pad_check[2] -
+                   pad_check[3] +
+                   (dilation_check[1] * (ksize_check[1] - 1) + 1);
+
+    PADDLE_ENFORCE_EQ(
+        xh_check == img_xh && xw_check == img_xw,
+        true,
+        errors::InvalidArgument(
+            ("XPU output size check error in conv_transpose op.")));
+  }
+
+  int r =
+      xpu::conv2d_transpose(ctx.x_context(),
+                            x.data<T>(),
+                            filter_.data<T>(),
+                            out->data<T>(),
+                            batch_size,
+                            img_yc,
+                            img_yh,
+                            img_yw,
+                            img_xc,
+                            ksize,
+                            strides,
+                            paddings_,
+                            dilations_,
+                            groups,
+                            nullptr,
+                            nullptr,
+                            nullptr,
+                            true);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_transpose, XPU, ALL_LAYOUT, phi::Conv2dTransposeKernel, float) {}
diff --git a/paddle/phi/kernels/xpu/grid_sample_kernel.cc b/paddle/phi/kernels/xpu/grid_sample_kernel.cc
new file mode 100644
index 0000000000000..abe5a17c62509
--- /dev/null
+++ b/paddle/phi/kernels/xpu/grid_sample_kernel.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/grid_sample_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GridSampleKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& grid,
+                      const std::string& mode,
+                      const std::string& padding_mode,
+                      bool align_corners,
+                      DenseTensor* out) {
+  int n = x.dims()[0];
+  int c = x.dims()[1];
+  int h = x.dims()[2];
+  int w = x.dims()[3];
+  int out_h = grid.dims()[1];
+  int out_w = grid.dims()[2];
+
+  // attrs
+  // paddle.nn.functional.grid_sample(x, grid, mode='bilinear',
+  // padding_mode='zeros', align_corners=True, name=None)
+  const std::string data_format =
+      paddle::framework::DataLayoutToString(x.layout());
+
+  // attr to real param
+  bool is_nearest_bool;
+  if (mode == "bilinear") {
+    is_nearest_bool = false;
+  } else if (mode == "nearest") {
+    is_nearest_bool = true;
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "should not reach here: mode should be either 'bilinear' or "
+        "'nearest', but got %s.",
+        mode));
+  }
+
+  // attention: 0: zeros, 2: reflection, 1: border according to the XDNN API.
+  int padding_mode_int;
+  if (padding_mode == "zeros") {
+    padding_mode_int = 0;
+  } else if (padding_mode == "reflection") {
+    padding_mode_int = 2;
+  } else if (padding_mode == "border") {
+    padding_mode_int = 1;
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "should not reach here: padding_mode should be either 'zeros' or "
+        "'reflection' or 'border', but got %s.",
+        padding_mode));
+  }
+
+  bool is_nchw_bool;
+  if (data_format == "NCHW") {
+    is_nchw_bool = true;
+  } else if (data_format == "NHWC") {
+    is_nchw_bool = false;
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "should not reach here: data_format should be either 'NCHW' or "
+        "'NHWC', but got %s.",
+        data_format));
+  }
+
+  // data pointers
+  const T* input_data = x.data<T>();
+  const T* grid_data = grid.data<T>();
+  out->Resize(make_ddim({n, c, out_h, out_w}));
+  T* output_data = dev_ctx.template Alloc<T>(out);
+
+  // int grid_sample(Context* ctx, const T* x, const T* grid, T* y, int n, int
+  // c, int xh, int xw, int yh, int yw, bool is_nearest, bool align_corners,
+  // int padding_mode, bool is_nchw);
+  int r = xpu::grid_sample(dev_ctx.x_context(),
+                           input_data,
+                           grid_data,
+                           output_data,
+                           n,
+                           c,
+                           h,
+                           w,
+                           out_h,
+                           out_w,
+                           is_nearest_bool,
+                           align_corners,
+                           padding_mode_int,
+                           is_nchw_bool);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "grid_sampler");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(grid_sample, XPU, ALL_LAYOUT, phi::GridSampleKernel, float) {
+}
diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt
index 152bc0dd0c060..d1c9d25483fec 100644
--- a/paddle/phi/tests/kernels/CMakeLists.txt
+++ b/paddle/phi/tests/kernels/CMakeLists.txt
@@ -134,3 +134,8 @@ cc_test(
   test_memcpy_dev_api
   SRCS test_memcpy_dev_api.cc
   DEPS phi phi_api_utils)
+
+cc_test(
+ test_transfer_layout_dev_api + SRCS test_transfer_layout_dev_api.cc + DEPS phi phi_api_utils) diff --git a/paddle/phi/tests/kernels/test_transfer_layout_dev_api.cc b/paddle/phi/tests/kernels/test_transfer_layout_dev_api.cc new file mode 100644 index 0000000000000..0c81ecada96e1 --- /dev/null +++ b/paddle/phi/tests/kernels/test_transfer_layout_dev_api.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/transfer_layout_kernel.h" + +namespace phi { +namespace tests { + +#ifdef PADDLE_WITH_MKLDNN +TEST(DEV_API, transfer_layout) { + // 1. create tensor + + const int n = 2; + const int c = 3; + const int h = 4; + const int w = 5; + + DenseTensor x; + MetaTensor meta_x(&x); + meta_x.set_dtype(DataType::FLOAT32); + meta_x.set_layout(DataLayout::MKLDNN); + meta_x.set_dims(make_ddim({n, c, h, w})); + + DenseTensor out; + + // 2. test API + auto& pool = phi::DeviceContextPool::Instance(); + auto place = phi::CPUPlace(); + auto* dev_ctx = static_cast(pool.GetByPlace(place)); + + MetaTensor meta_out(&out); + TransferLayoutInferMeta(x, + static_cast(x.layout()), + static_cast(DataLayout::NHWC), + &meta_out); + TransferLayoutKernel(*dev_ctx, + x, + static_cast(x.layout()), + static_cast(DataLayout::NHWC), + &out); + + // 3. 
check result + std::vector expect_shape = {12, 3}; + ASSERT_EQ(out.dims(), make_ddim({n, h, w, c})); + ASSERT_EQ(out.dims().size(), 4); + ASSERT_EQ(out.meta().dtype, DataType::FLOAT32); + ASSERT_EQ(out.meta().layout, DataLayout::NHWC); +} + +#endif +} // namespace tests +} // namespace phi diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index eeac7e5f12c0c..4bd9bfb670fc1 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -43,10 +43,10 @@ taskkill /f /im cicc.exe /t 2>NUL taskkill /f /im ptxas.exe /t 2>NUL taskkill /f /im op_function_generator.exe /t 2>NUL taskkill /f /im eager_generator.exe /t 2>NUL -taskkill /f /im eager_op_function_generator.exe /t 2>NUL +taskkill /f /im eager_legacy_op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL wmic process where name="eager_generator.exe" call terminate 2>NUL -wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL +wmic process where name="eager_legacy_op_function_generator.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL @@ -531,10 +531,10 @@ taskkill /f /im cicc.exe /t 2>NUL taskkill /f /im ptxas.exe /t 2>NUL taskkill /f /im op_function_generator.exe /t 2>NUL taskkill /f /im eager_generator.exe /t 2>NUL -taskkill /f /im eager_op_function_generator.exe /t 2>NUL +taskkill /f /im eager_legacy_op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL wmic process where name="eager_generator.exe" call terminate 2>NUL -wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL +wmic process where name="eager_legacy_op_function_generator.exe" call terminate 2>NUL wmic process where name="cmake.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL @@ -933,10 +933,10 @@ taskkill /f /im cicc.exe /t 2>NUL taskkill /f /im ptxas.exe /t 2>NUL taskkill /f /im op_function_generator.exe /t 2>NUL taskkill /f /im eager_generator.exe /t 2>NUL -taskkill /f /im eager_op_function_generator.exe /t 2>NUL +taskkill /f /im eager_legacy_op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL wmic process where name="eager_generator.exe" call terminate 2>NUL -wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL +wmic process where name="eager_legacy_op_function_generator.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL diff --git a/python/paddle/_C_ops.py b/python/paddle/_C_ops.py index e8f89c739c953..aa501051e734c 100644 --- a/python/paddle/_C_ops.py +++ b/python/paddle/_C_ops.py @@ -17,39 +17,6 @@ __all__ = [] -_already_switch_to_eager_ = False - -if not framework._in_eager_mode_: - for name in dir(core.ops): - globals()[name] = getattr(core.ops, name) - __all__.append(name) - _already_switch_to_eager_ = False -else: - for name in dir(core.eager.ops): - globals()[name] = getattr(core.eager.ops, name) - __all__.append(name) - _already_switch_to_eager_ = True - - -def switch_to_core_ops(): - global _already_switch_to_eager_ - if _already_switch_to_eager_: - for name in dir(core.eager.ops): - del globals()[name] - 
__all__.remove(name) - for name in dir(core.ops): - globals()[name] = getattr(core.ops, name) - __all__.append(name) - _already_switch_to_eager_ = False - - -def switch_to_eager_ops(): - global _already_switch_to_eager_ - if not _already_switch_to_eager_: - for name in dir(core.ops): - del globals()[name] - __all__.remove(name) - for name in dir(core.eager.ops): - globals()[name] = getattr(core.eager.ops, name) - __all__.append(name) - _already_switch_to_eager_ = True +for name in dir(core.eager.ops): + globals()[name] = getattr(core.eager.ops, name) + __all__.append(name) diff --git a/python/paddle/_legacy_C_ops.py b/python/paddle/_legacy_C_ops.py new file mode 100644 index 0000000000000..ace90e62edfb8 --- /dev/null +++ b/python/paddle/_legacy_C_ops.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core +from .fluid import framework + +__all__ = [] + +_already_switch_to_eager_ = False + +if not framework._in_eager_mode_: + for name in dir(core.ops): + globals()[name] = getattr(core.ops, name) + __all__.append(name) + _already_switch_to_eager_ = False +else: + for name in dir(core.eager.ops.legacy): + globals()[name] = getattr(core.eager.ops.legacy, name) + __all__.append(name) + _already_switch_to_eager_ = True + + +def switch_to_core_ops(): + global _already_switch_to_eager_ + if _already_switch_to_eager_: + for name in dir(core.eager.ops.legacy): + del globals()[name] + __all__.remove(name) + for name in dir(core.ops): + globals()[name] = getattr(core.ops, name) + __all__.append(name) + _already_switch_to_eager_ = False + + +def switch_to_eager_ops(): + global _already_switch_to_eager_ + if not _already_switch_to_eager_: + for name in dir(core.ops): + del globals()[name] + __all__.remove(name) + for name in dir(core.eager.ops.legacy): + globals()[name] = getattr(core.eager.ops.legacy, name) + __all__.append(name) + _already_switch_to_eager_ = True diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index a4e918fd01f59..0879467b72eb7 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -37,7 +37,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle.fluid.dygraph_utils as dygraph_utils import contextlib @@ -355,7 +355,7 @@ def barrier(group=None): temp = fill_constant([1], dtype="int32", value="1") if _non_static_mode(): - return _C_ops.barrier(temp, temp, 'ring_id', ring_id) + return _legacy_C_ops.barrier(temp, temp, 'ring_id', ring_id) op_type = 'barrier' @@ -657,7 +657,7 @@ def wait(tensor, group=None, use_calc_stream=True): def _sync_calc_stream(tensor): if _non_static_mode(): - return _C_ops.c_sync_calc_stream(tensor, tensor) + return _legacy_C_ops.c_sync_calc_stream(tensor, tensor) op_type = 'c_sync_calc_stream' @@ -672,7 +672,8 @@ def _sync_calc_stream(tensor): def 
_sync_comm_stream(tensor, ring_id=0): if _non_static_mode(): - return _C_ops.c_sync_comm_stream([tensor], [tensor], 'ring_id', ring_id) + return _legacy_C_ops.c_sync_comm_stream([tensor], [tensor], 'ring_id', + ring_id) op_type = 'c_sync_comm_stream' @@ -750,9 +751,9 @@ def broadcast(tensor, src, group=None, use_calc_stream=True): assert gsrc >= 0, ("src rank out of group, need global rank") if _non_static_mode(): - return _C_ops.c_broadcast(tensor, tensor, 'root', gsrc, - 'use_calc_stream', use_calc_stream, 'ring_id', - ring_id) + return _legacy_C_ops.c_broadcast(tensor, tensor, 'root', gsrc, + 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id) op_type = 'c_broadcast' check_variable_and_dtype( @@ -830,17 +831,21 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id if _non_static_mode(): if op == ReduceOp.SUM: - return _C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) elif op == ReduceOp.MAX: - return _C_ops.c_allreduce_max_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return _legacy_C_ops.c_allreduce_max_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) elif op == ReduceOp.MIN: - return _C_ops.c_allreduce_min_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return _legacy_C_ops.c_allreduce_min_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) elif op == ReduceOp.PROD: - return _C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return _legacy_C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) else: raise ValueError("Unknown parameter: {}.".format(op)) @@ -931,21 +936,22 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): if _non_static_mode(): if op == ReduceOp.SUM: - return _C_ops.c_reduce_sum(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'root_id', gdst) + return _legacy_C_ops.c_reduce_sum(tensor, tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, 'root_id', gdst) elif op == ReduceOp.MAX: - return _C_ops.c_reduce_max(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'root_id', gdst) + return _legacy_C_ops.c_reduce_max(tensor, tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, 'root_id', gdst) elif op == ReduceOp.MIN: - return _C_ops.c_reduce_min(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'root_id', gdst) + return _legacy_C_ops.c_reduce_min(tensor, tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, 'root_id', gdst) elif op == ReduceOp.PROD: - return _C_ops.c_reduce_prod(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'root_id', gdst) + return _legacy_C_ops.c_reduce_prod(tensor, tensor, + 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, 'root_id', gdst) else: raise ValueError("Unknown parameter: {}.".format(op)) @@ -1052,8 +1058,9 @@ def convert_to_complex(list_of_tensor): nranks = _get_global_group().nranks if group is None else group.nranks if _non_static_mode(): - out = _C_ops.c_allgather(tensor, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id, 'nranks', nranks) + out = _legacy_C_ops.c_allgather(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, + 'nranks', nranks) else: op_type = 
'c_allgather' helper = LayerHelper(op_type, **locals()) @@ -1237,9 +1244,9 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): return task if _non_static_mode(): - return _C_ops.c_scatter(temp, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, 'nranks', - nranks, 'root', gsrc) + return _legacy_C_ops.c_scatter(temp, tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, + 'nranks', nranks, 'root', gsrc) op_type = 'c_scatter' check_variable_and_dtype( tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], @@ -1273,8 +1280,9 @@ def _c_identity(tensor, group=None): ring_id = 0 if group is None else group.id if _non_static_mode(): - return _C_ops.c_identity(tensor, 'use_calc_stream', True, 'ring_id', - ring_id, 'use_model_parallel', True) + return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True, + 'ring_id', ring_id, + 'use_model_parallel', True) op_type = 'c_identity' helper = LayerHelper(op_type, **locals()) out = helper.create_variable_for_type_inference(dtype=tensor.dtype) @@ -1316,9 +1324,10 @@ def _c_concat(tensor, group=None): nranks = group.nranks if _non_static_mode(): - return _C_ops.c_concat(tensor, 'ring_id', ring_id, 'use_calc_stream', - True, 'rank', rank, 'nranks', nranks, - 'use_model_parallel', True) + return _legacy_C_ops.c_concat(tensor, 'ring_id', ring_id, + 'use_calc_stream', True, 'rank', rank, + 'nranks', nranks, 'use_model_parallel', + True) op_type = 'c_concat' helper = LayerHelper(op_type, **locals()) @@ -1363,9 +1372,9 @@ def _c_split(tensor, group=None): nranks = _get_global_env().world_size if group is None else group.nranks if _non_static_mode(): - return _C_ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', - ring_id, 'rank', rank, 'nranks', nranks, - 'use_model_parallel', True) + return _legacy_C_ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', + ring_id, 'rank', rank, 'nranks', nranks, + 'use_model_parallel', True) op_type = 'c_split' helper = LayerHelper(op_type, **locals()) @@ -1410,26 +1419,27 @@ class mp_allreduce_eager(PyLayer): def forward(ctx, tensor, use_calc_stream, ring_id, use_model_parallel): ctx.ring_id = ring_id - return _C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id, "use_model_parallel", - use_model_parallel) + return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, + 'ring_id', ring_id, + "use_model_parallel", + use_model_parallel) @staticmethod def backward(ctx, dy): - return _C_ops.c_identity(dy, 'use_calc_stream', True, 'ring_id', - ctx.ring_id, 'use_model_parallel', - True) + return _legacy_C_ops.c_identity(dy, 'use_calc_stream', True, + 'ring_id', ctx.ring_id, + 'use_model_parallel', True) return mp_allreduce_eager.apply(tensor, use_calc_stream, ring_id, use_model_parallel) elif _in_legacy_dygraph(): if op == ReduceOp.SUM: - return _C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - "use_model_parallel", - use_model_parallel) + return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, "use_model_parallel", + use_model_parallel) else: raise ValueError("Unknown parameter: {}.".format(op)) @@ -1467,7 +1477,8 @@ def _c_lookup_table(table, index, start_index=0, name=None): Tensor. 
""" if _non_static_mode(): - return _C_ops.c_embedding(table, index, "start_index", start_index) + return _legacy_C_ops.c_embedding(table, index, "start_index", + start_index) op_type = 'c_embedding' helper = LayerHelper(op_type, **locals()) @@ -1543,7 +1554,7 @@ def _c_softmax_with_cross_entropy(logits, label = paddle.unsqueeze(label, axis=-1) if _non_static_mode(): - softmax, loss = _C_ops.c_softmax_with_cross_entropy( + softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy( logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks) if not return_softmax: return loss @@ -1581,8 +1592,8 @@ def _linear(x, weight, bias=None, name=None): """ if _non_static_mode(): pre_bias = _varbase_creator(dtype=x.dtype) - _C_ops.matmul(x, weight, pre_bias, 'transpose_X', False, 'transpose_Y', - False, "alpha", 1) + _legacy_C_ops.matmul(x, weight, pre_bias, 'transpose_X', False, + 'transpose_Y', False, "alpha", 1) return dygraph_utils._append_bias_in_dygraph(pre_bias, bias, axis=len(x.shape) - 1) @@ -2056,8 +2067,8 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): return if _non_static_mode(): - out = _C_ops.alltoall(temp, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id) + out = _legacy_C_ops.alltoall(temp, 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id) else: op_type = 'alltoall' helper = LayerHelper(op_type, **locals()) @@ -2225,8 +2236,8 @@ def send(tensor, dst=0, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id if _non_static_mode(): - return _C_ops.send_v2(tensor, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id, 'peer', dst) + return _legacy_C_ops.send_v2(tensor, 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'peer', dst) op_type = 'send_v2' check_variable_and_dtype( tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], @@ -2288,9 +2299,9 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id if _non_static_mode(): - return _C_ops.recv_v2(tensor, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id, 'peer', src, 'dtype', - tensor.dtype, 'out_shape', tensor.shape) + return _legacy_C_ops.recv_v2(tensor, 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'peer', src, 'dtype', + tensor.dtype, 'out_shape', tensor.shape) op_type = 'recv_v2' check_variable_and_dtype( tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 060d5defd4973..63e4c5ec1822b 100644 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -30,7 +30,7 @@ from paddle.fluid.ir import apply_build_strategy from .base import topology as tp from .meta_parallel import model_parallel_random_seed -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index 641bc25e5c59e..5e11760d913d0 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -22,7 +22,7 @@ import types from paddle.fluid import core import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ 
= [] @@ -66,8 +66,8 @@ def _unscale(self, optimizer): param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] - _C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads, - self._found_inf) + _legacy_C_ops.check_finite_and_unscale(param_grads, self._scale, + param_grads, self._found_inf) # allreduce_max found_inf in check_group if not self._use_dp_mode: self._found_inf = paddle.cast(self._found_inf, dtype="int32") diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index 58b0515e0bac8..371a8b3e04121 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -91,11 +91,18 @@ def __init__(self, class SegmentLayers(object): - def __init__(self, layers_desc, num_parts, method="uniform"): + def __init__(self, + layers_desc, + num_parts, + method="uniform", + num_virtual_pipeline_stage=None): self._layers_desc = layers_desc self.method = method self.num_parts = num_parts self.num_items = len(layers_desc) + self.num_virtual_pipeline_stage = num_virtual_pipeline_stage + if self.num_virtual_pipeline_stage is not None: + self.total_parts = num_parts * self.num_virtual_pipeline_stage assert self.num_items >= self.num_parts, "layer number should be greater than number of segments" def do_segment(self): @@ -110,12 +117,14 @@ def do_segment(self): for idx in weight_idxs: weights[idx] = 1 + actual_num_parts = self.num_parts if self.num_virtual_pipeline_stage is None else self.total_parts + assert sum( weights - ) % self.num_parts == 0, "number of layers ({}) should be divided by part number({})".format( - sum(weights), self.num_parts) - part_size = sum(weights) // self.num_parts - result = [0 for _ in range(self.num_parts + 1)] + ) % actual_num_parts == 0, "number of layers ({}) should be divisible by part number ({})".format( + sum(weights), actual_num_parts) + part_size = sum(weights) // actual_num_parts + result = [0 for _ in range(actual_num_parts + 1)] memory_counter = 0 result_idx = 1 @@ -125,7 +134,7 @@ def do_segment(self): result[result_idx] = idx + 1 result_idx += 1 memory_counter = 0 - result[self.num_parts] = len(weights) + result[actual_num_parts] = len(weights) return result def _gen_layer_weight(self, layername): @@ -159,6 +168,32 @@ def uniform(self, num_items, num_parts): return result +class PipelineLayerChunk(Layer): + + def __init__(self): + super(PipelineLayerChunk, self).__init__() + self.run_function = [] + + def append(self, sublayer): + # This method is used to unify the code paths in _build_layer_impl. + # For the 1f1b scheduler, it will call the append method of a list. + # For the interleave scheduler, it will call the append method of this class. + if isinstance(sublayer, Layer): + self.add_sublayer(str(len(self.run_function)), sublayer) + self.run_function.append(sublayer) + + def get_run_function(self): + return self.run_function + + def forward(self, *args, **kwargs): + # Users shouldn't call PipelineLayerChunk directly, since all logic relating to recompute + # is in the forward function of PipelineLayer. Any direct call will bring unexpected + # behavior under recompute circumstances. + raise NotImplementedError( + "The forward function of PipelineLayerChunk cannot be called directly. " + "Please call the forward function of PipelineLayer.")
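The boundary arithmetic in ``do_segment`` is easy to check in isolation. Below is a hypothetical stand-alone sketch (illustrative helper name, uniform case where every layer has weight 1 — not part of the patch):

```python
# Hypothetical sketch of SegmentLayers' boundary arithmetic (uniform case,
# every layer weight == 1); segment_boundaries is an illustrative name only.
def segment_boundaries(num_layers, num_parts, num_virtual_pipeline_stage=None):
    actual_num_parts = (num_parts * num_virtual_pipeline_stage
                        if num_virtual_pipeline_stage else num_parts)
    assert num_layers % actual_num_parts == 0, \
        "number of layers ({}) should be divisible by part number ({})".format(
            num_layers, actual_num_parts)
    part_size = num_layers // actual_num_parts
    return [i * part_size for i in range(actual_num_parts + 1)]

# 8 layers, 2 real pp stages, 2 virtual stages -> 4 parts of 2 layers each.
print(segment_boundaries(8, 2, 2))  # [0, 2, 4, 6, 8]
```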
" + "Please call forward function of PipelineLayer.") + + class PipelineLayer(Layer): def __init__(self, @@ -169,11 +204,26 @@ def __init__(self, seg_method="uniform", recompute_interval=0, recompute_offload=False, - recompute_partition=False): + recompute_partition=False, + num_virtual_pipeline_stages=None): super(PipelineLayer, self).__init__() if num_stages is None and topology is None: raise ValueError("should provide num_stages or topology") + if num_virtual_pipeline_stages: + assert isinstance(num_virtual_pipeline_stages, int), \ + "virtual_pipeline_stage should be None or an int" + if num_virtual_pipeline_stages > 1: + logger.info( + "set num_virtual_pipeline_stages > 1 means using interleave scheduler instead of 1f1b scheduler" + ) + assert isinstance(seg_method, str), \ + "seg_method should be a str for interleave scheduler" + assert seg_method.startswith('layer:'), \ + "seg_method shoud be start with layer: for interleave scheduler" + + self._num_virtual_pipeline_stages = 1 if num_virtual_pipeline_stages is None else num_virtual_pipeline_stages + # lazy import import paddle.distributed as dist from paddle.distributed import fleet @@ -214,28 +264,51 @@ def __init__(self, self._stage_id = self._topo.get_coord(self.global_rank).pipe self._num_stages = self._topo.get_dim_size("pipe") + self._total_stages_with_virtual_stages = self._num_stages * self._num_virtual_pipeline_stages + # initialize segment self._layers_desc = list(self.layers) self._num_layers = len(self._layers_desc) - self._start_pos = 0 - self._end_pos = self._num_layers - 1 - self._segment_network(seg_method) self.shared_layers = paddle.nn.LayerDict() self.shared_weight_attrs = {} - # construct layer - self.run_function = [] - self._build_layer() + if self._num_virtual_pipeline_stages > 1: + # interleaving pipeline segmentation + self._start_poss = [] + self._end_poss = [] + self._segment_network_for_interleave(seg_method) + # The _model_chunks is a list of PipelineLayerChunk, + # while PipelineLayerChunk is a list of Layers relating with one model chunk. + # Therefore, the _model_chunks is something like 'list of a list of layers'. + self._model_chunks = [] + self._build_layer_with_interleave() + else: + # 1f1b pipeline segmentation + self._start_pos = 0 + self._end_pos = self._num_layers - 1 + self._segment_network(seg_method) + # construct layer + self.run_function = [] + self._build_layer() self.shared_comm = self._construct_shared_comm() self._synchronize_shared_weights() def get_stage_from_index(self, layer_idx): assert 0 <= layer_idx < self._num_layers, "layer_idx is out of bound" - for stage in range(self._topo.get_dim('pipe')): - if self.segment_parts[stage] <= layer_idx < self.segment_parts[stage - + 1]: - return stage + for virtual_pp_rank in range(self._num_virtual_pipeline_stages): + # Mapping the virtual pipeline stage to the real pipeline stage. + # start_idx marks the start of a new virtual pp stage. 
def get_stage_from_index(self, layer_idx): assert 0 <= layer_idx < self._num_layers, "layer_idx is out of bound" - for stage in range(self._topo.get_dim('pipe')): - if self.segment_parts[stage] <= layer_idx < self.segment_parts[stage - + 1]: - return stage + for virtual_pp_rank in range(self._num_virtual_pipeline_stages): + # Mapping the virtual pipeline stage to the real pipeline stage. + # start_idx marks the start of a new virtual pp stage. + start_idx = virtual_pp_rank * self._num_virtual_pipeline_stages + for stage in range(self._num_stages): + # stage marks the real pp stage + if self.segment_parts[start_idx + + stage] <= layer_idx < self.segment_parts[ + start_idx + stage + 1]: + return stage + + def get_model_chunks(self): + return None if self._num_virtual_pipeline_stages == 1 else self._model_chunks def _construct_shared_comm(self): shared_comm = {} @@ -316,6 +389,33 @@ def allreduce_shared_weight_gradients(self): 'use_calc_stream': True }) + def _segment_network_for_interleave(self, seg_method): + logger.info("start segment network for interleave scheduler") + seg = SegmentLayers( + self._layers_desc, + num_parts=self._num_stages, + method=seg_method, + num_virtual_pipeline_stage=self._num_virtual_pipeline_stages) + self.segment_parts = seg.do_segment() + + logger.info("segment result:" + + ", ".join(str(arg) for arg in self.segment_parts)) + + for i in range(self._stage_id, self._total_stages_with_virtual_stages, + self._num_virtual_pipeline_stages): + # Suppose there are 2 real pp stages and 2 virtual pp stages, and the model has 8 layers. + # Layers [0, 1], [4, 5] will be assigned to the first real pp stage. + # Layers [2, 3], [6, 7] will be assigned to the second real pp stage. + # Layers [0, 1] and [2, 3] are the first virtual pp stage in each real pp stage. + # Layers [4, 5] and [6, 7] are the second virtual pp stage in each real pp stage. + assert self.segment_parts[i] <= self.segment_parts[i + 1] + self._start_poss.append(self.segment_parts[i]) + self._end_poss.append(self.segment_parts[i + 1]) + + assert len(self._start_poss) == len(self._end_poss) + + self._print_segmentation_for_debug() + def _segment_network(self, seg_method): logger.info("start segment network..") seg = SegmentLayers(self._layers_desc, @@ -328,9 +428,12 @@ def _segment_network(self, seg_method): self._start_pos = self.segment_parts[self._stage_id] self._end_pos = self.segment_parts[self._stage_id + 1] + self._print_segmentation_for_debug() + def _print_segmentation_for_debug(self): # print information for debug - for stage in range(self._num_stages): + for stage in range(self._num_stages * + self._num_virtual_pipeline_stages): start = self.segment_parts[stage] end = self.segment_parts[stage + 1] logger.info("stage={}, global_rank={} ,layer_number={}".format( @@ -339,20 +442,53 @@ def _segment_network(self, seg_method): for index, layer in enumerate(self._layers_desc[start:end]): logger.info("{}: {}".format(index + start, str(layer))) + if self._num_virtual_pipeline_stages > 1: + for stage in range(self._num_stages): + stage_to_virtual_stage_info = "stage {} contains virtual stages: ".format( + stage) + for i in range(stage, self._total_stages_with_virtual_stages, + self._num_virtual_pipeline_stages): + stage_to_virtual_stage_info += " {},".format(i) + logger.info(stage_to_virtual_stage_info) + if self._loss_fn: try: logger.info("loss: {}".format(self._loss_fn.__name__)) except AttributeError: logger.info("loss: {}".format(self._loss_fn.__class__.__name__)) + def _build_layer_with_interleave(self): + for i in range(len(self._start_poss)): + start = self._start_poss[i] + end = self._end_poss[i] + # Get a model chunk + chunk = self._build_layer_impl(start, end) + assert isinstance(chunk, PipelineLayerChunk) + # Add the chunk to the chunk list and register it as a sublayer + self._model_chunks.append(chunk) + self.add_sublayer(str(start), chunk) +
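The assignment spelled out in the comment inside ``_segment_network_for_interleave`` can be reproduced with a small worked example. A hypothetical stand-alone sketch (illustrative values only, not part of the patch):

```python
# 8 layers, 2 real pp stages, 2 virtual stages: segment_parts = [0, 2, 4, 6, 8].
# Each real stage picks every num_virtual-th boundary pair, as in the loop above.
segment_parts = [0, 2, 4, 6, 8]
num_stages, num_virtual = 2, 2
total_parts = num_stages * num_virtual
for stage_id in range(num_stages):
    chunks = [(segment_parts[i], segment_parts[i + 1])
              for i in range(stage_id, total_parts, num_virtual)]
    print("real stage {} owns layer ranges {}".format(stage_id, chunks))
# real stage 0 owns layer ranges [(0, 2), (4, 6)] -> layers [0, 1] and [4, 5]
# real stage 1 owns layer ranges [(2, 4), (6, 8)] -> layers [2, 3] and [6, 7]
```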
def _build_layer(self): start = self._start_pos end = self._end_pos + self.run_function = self._build_layer_impl(start, end) + + def _build_layer_impl(self, start, end): + if self._num_virtual_pipeline_stages > 1: + # For the interleave scheduler, all layers belonging to one model chunk will be saved in a PipelineLayerChunk + run_function = PipelineLayerChunk() + else: + # For the 1f1b scheduler, just use the run_function list + run_function = self.run_function + for index, layer in enumerate(self._layers_desc[start:end]): layer_index = start + index if isinstance(layer, Layer): - self.run_function.append(layer) - self.add_sublayer(str(layer_index), layer) + run_function.append(layer) + if self._num_virtual_pipeline_stages == 1: + # Only add the sublayer for the 1f1b scheduler; + # for interleave, PipelineLayerChunk will do this + self.add_sublayer(str(layer_index), layer) elif isinstance(layer, SharedLayerDesc): if layer.layer_name not in self.shared_layers: self.shared_layers[layer.layer_name] = layer.build_layer() @@ -363,20 +499,24 @@ def _build_layer(self): setattr(param, "is_firstly_shared", True) if layer.forward_func is None: - self.run_function.append( - self.shared_layers[layer.layer_name]) + run_function.append(self.shared_layers[layer.layer_name]) else: - self.run_function.append( + run_function.append( partial(layer.forward_func, self.shared_layers[layer.layer_name])) elif isinstance(layer, LayerDesc): model = layer.build_layer() - self.run_function.append(model) - self.add_sublayer(str(layer_index), model) + run_function.append(model) + if self._num_virtual_pipeline_stages == 1: + # Only add the sublayer for the 1f1b scheduler; + # for interleave, PipelineLayerChunk will do this + self.add_sublayer(str(layer_index), model) else: - self.run_function.append(layer) + run_function.append(layer) + + return run_function def forward_function(self, start, end): @@ -389,7 +529,22 @@ def execute_func(*x): return execute_func - def forward(self, input): + def forward(self, input, chunk_id=None): + if chunk_id is not None: + assert isinstance(chunk_id, int), "chunk_id should be an int" + assert self._num_virtual_pipeline_stages > 1, \ + "chunk_id is only valid when using virtual pipeline stages" + assert chunk_id < len(self._model_chunks), \ + "The virtual pipeline only has {} chunks, " \ + "but received chunk_id {}.".format(len(self._model_chunks), chunk_id) + # Get the target model chunk. + model_chunk = self._model_chunks[chunk_id] + # Update self.run_function to the target run functions. + # Runs for 1f1b and interleave are similar: both just execute every function in self.run_function. + # The only difference is that, for 1f1b, self.run_function was already initialized during _build_layer, + # while for interleave, self.run_function is re-pointed to the target chunk's functions on every run.
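# (Hypothetical driver sketch, assuming the 2-stage / 2-chunk example above.)
# An interleave scheduler would call forward once per chunk, e.g.
# out = model(x, chunk_id=0) runs layers [0, 1] on real stage 0, then
# out = model(x, chunk_id=1) runs layers [4, 5]; calling model(x) with
# chunk_id=None executes the prebuilt 1f1b run_function instead.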
+ self.run_function = model_chunk.get_run_function() + if self._recompute_interval == 0: input = self.forward_function(0, len(self.run_function))(input) else: diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index 55c6a3308b8c1..900c0f79798fc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -15,7 +15,7 @@ import paddle import contextlib import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.framework import _non_static_mode, default_main_program, Variable @@ -205,9 +205,10 @@ def dropout(x, # dygraph using tracker, doesn't need determinate seed if _non_static_mode(): - out, mask = _C_ops.dropout(x, 'dropout_prob', p, 'is_test', - not training, 'fix_seed', False, 'seed', 0, - 'dropout_implementation', mode) + out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test', + not training, 'fix_seed', False, + 'seed', 0, 'dropout_implementation', + mode) return out seed = determinate_seed(rng_name) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index f42752c5e8f1b..14a2aa844826a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -16,7 +16,7 @@ from .utils import paddle_2_number, number_2_dtype from ...utils.log_util import logger import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle.fluid.core as core from paddle.fluid.framework import _in_legacy_dygraph, _non_static_mode, in_dygraph_mode @@ -166,9 +166,10 @@ def _is_valid_send_recv_partial(tensor, mp_degree): def _partial_send_op(tensor, group, use_calc_stream, ring_id, dst, nranks, rank_id): if _in_legacy_dygraph(): - return _C_ops.partial_send(tensor.detach(), 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, 'peer', - dst, 'num', nranks, 'id', rank_id) + return _legacy_C_ops.partial_send(tensor.detach(), 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, + 'peer', dst, 'num', nranks, 'id', + rank_id) elif in_dygraph_mode(): group = paddle.distributed.collective._get_default_group( ) if group is None else group @@ -204,10 +205,11 @@ def send_partial(tensor, def _partial_recv_op(tensor, group, use_calc_stream, ring_id, src, nranks, rank_id): if _in_legacy_dygraph(): - return _C_ops.partial_recv(tensor.detach(), 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, 'peer', - src, 'num', nranks, 'id', rank_id, 'dtype', - tensor.dtype, 'out_shape', tensor.shape) + return _legacy_C_ops.partial_recv(tensor.detach(), 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, + 'peer', src, 'num', nranks, 'id', + rank_id, 'dtype', tensor.dtype, + 'out_shape', tensor.shape) elif in_dygraph_mode(): group = paddle.distributed.collective._get_default_group( ) if group is None else group @@ -243,9 +245,11 @@ def recv_partial(tensor, def _partial_allgather_op(tensor, group, use_calc_stream, ring_id, nranks, rank_id): if _in_legacy_dygraph(): - return _C_ops.partial_allgather_(tensor.detach(), 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'nranks', nranks, 'rank', 
rank_id) + return _legacy_C_ops.partial_allgather_(tensor.detach(), + 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, 'nranks', nranks, + 'rank', rank_id) elif in_dygraph_mode(): group = paddle.distributed.collective._get_default_group( ) if group is None else group diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 46fe7e641733a..bb774b8a0e5f8 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -16,7 +16,7 @@ import paddle from paddle.fluid import core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.autograd import PyLayer from paddle.fluid import framework from ...utils.recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker @@ -115,8 +115,8 @@ def _all_gather(tensor, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id nranks = paddle.distributed.collective._get_global_group( ).nranks if group is None else group.nranks - return _C_ops.c_allgather(tensor, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id, 'nranks', nranks) + return _legacy_C_ops.c_allgather(tensor, 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'nranks', nranks) def _split_activation(tensor): diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index fcbbadbe12159..8cff407363a3b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -19,7 +19,7 @@ from types import MethodType import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid import layers from paddle.fluid.dygraph import to_variable @@ -209,13 +209,15 @@ def unscale_method(self, optimizer): with device_guard(dev_id, device): if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - temp_found_inf_fp16) + _legacy_C_ops.check_finite_and_unscale(param_grads_fp16, + self._scale, + param_grads_fp16, + temp_found_inf_fp16) if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - temp_found_inf_fp32) + _legacy_C_ops.check_finite_and_unscale(param_grads_fp32, + self._scale, + param_grads_fp32, + temp_found_inf_fp32) self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 63e2b91b3d9bd..d21502bcc16b8 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -21,7 +21,7 @@ from types import MethodType import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid import layers from paddle.fluid.dygraph import to_variable @@ -210,13 +210,15 @@ def unscale_method(self, optimizer): with device_guard(dev_id, device): if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - 
temp_found_inf_fp16) + _legacy_C_ops.check_finite_and_unscale(param_grads_fp16, + self._scale, + param_grads_fp16, + temp_found_inf_fp16) if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - temp_found_inf_fp32) + _legacy_C_ops.check_finite_and_unscale(param_grads_fp32, + self._scale, + param_grads_fp32, + temp_found_inf_fp32) self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") diff --git a/python/paddle/distributed/fleet/scaler.py b/python/paddle/distributed/fleet/scaler.py index f9a35e246848f..1fcbaac34a56c 100644 --- a/python/paddle/distributed/fleet/scaler.py +++ b/python/paddle/distributed/fleet/scaler.py @@ -20,7 +20,7 @@ from paddle.fluid import core from paddle.fluid.dygraph import to_variable import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def distributed_scaler(scaler): @@ -60,13 +60,15 @@ def unscale_method(self, optimizer): temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - temp_found_inf_fp16) + _legacy_C_ops.check_finite_and_unscale(param_grads_fp16, + self._scale, + param_grads_fp16, + temp_found_inf_fp16) if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - temp_found_inf_fp32) + _legacy_C_ops.check_finite_and_unscale(param_grads_fp32, + self._scale, + param_grads_fp32, + temp_found_inf_fp32) self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index 3b955dd2a8d2f..cde6a8a97f0eb 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -16,7 +16,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def _number_count(numbers, upper_range): @@ -42,7 +42,7 @@ def _number_count(numbers, upper_range): print(number_count) # the result: [2, 0, 2, 0, 0, 0] """ if in_dygraph_mode(): - return _C_ops.number_count(numbers, 'upper_range', upper_range) + return _legacy_C_ops.number_count(numbers, 'upper_range', upper_range) elif _in_legacy_dygraph(): return core.ops.number_count(numbers, 'upper_range', upper_range) else: @@ -89,7 +89,7 @@ def _assign_pos(x, cum_count): print(pos) # the result: (2, 0, 3, 1) """ if in_dygraph_mode(): - return _C_ops.assign_pos(x, cum_count, cum_count[-1]) + return _legacy_C_ops.assign_pos(x, cum_count, cum_count[-1]) elif _in_legacy_dygraph(): return core.ops.assign_pos(x, cum_count, cum_count[-1]) else: @@ -124,7 +124,7 @@ def _random_routing(topk_idx, topk_value, prob, topk=2): """ if topk == 2: if in_dygraph_mode(): - return _C_ops.random_routing(prob, topk_value, topk_idx) + return _legacy_C_ops.random_routing(prob, topk_value, topk_idx) elif _in_legacy_dygraph(): return core.ops.random_routing(prob, topk_value, topk_idx) else: @@ -155,8 +155,8 @@ def _limit_by_capacity(expert_count, capacity, n_worker): print(out) # the result: [1, 2, 2, 4, 3, 3] """ if 
in_dygraph_mode(): - return _C_ops.limit_by_capacity(expert_count, capacity, 'n_worker', - n_worker) + return _legacy_C_ops.limit_by_capacity(expert_count, capacity, + 'n_worker', n_worker) elif _in_legacy_dygraph(): return core.ops.limit_by_capacity(expert_count, capacity, 'n_worker', n_worker) @@ -202,8 +202,9 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker): [1, 3, 3, 3, -1, 2, 1, 1]) """ if in_dygraph_mode(): - return _C_ops.prune_gate_by_capacity(gate_idx, expert_count, "n_expert", - n_expert, "n_worker", n_worker) + return _legacy_C_ops.prune_gate_by_capacity(gate_idx, expert_count, + "n_expert", n_expert, + "n_worker", n_worker) elif _in_legacy_dygraph(): return core.ops.prune_gate_by_capacity(gate_idx, expert_count, "n_expert", n_expert, "n_worker", diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index ec8ef80d5da60..6d8454a6e9ed9 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -31,7 +31,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _non_static_mode from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ #noqa 'get_host_name_ip', @@ -146,7 +146,7 @@ def global_scatter(x, ring_id = 0 if group is None else group.id if _non_static_mode(): - return _C_ops.global_scatter(x, local_count, \ + return _legacy_C_ops.global_scatter(x, local_count, \ global_count, \ 'use_calc_stream', use_calc_stream, \ 'ring_id', ring_id) @@ -259,7 +259,7 @@ def global_gather(x, ring_id = 0 if group is None else group.id if _non_static_mode(): - return _C_ops.global_gather(x, local_count, \ + return _legacy_C_ops.global_gather(x, local_count, \ global_count, \ 'use_calc_stream', use_calc_stream, \ 'ring_id', ring_id) diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py index fffcd94ad680e..cd44277f3e8a7 100644 --- a/python/paddle/distribution/categorical.py +++ b/python/paddle/distribution/categorical.py @@ -17,7 +17,7 @@ import numpy as np import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.distribution import distribution from paddle.fluid import core from paddle.fluid.data_feeder import (check_dtype, check_type, diff --git a/python/paddle/distribution/dirichlet.py b/python/paddle/distribution/dirichlet.py index 050af6069c5e6..6862bf30e06fb 100644 --- a/python/paddle/distribution/dirichlet.py +++ b/python/paddle/distribution/dirichlet.py @@ -158,9 +158,9 @@ def _dirichlet(concentration, name=None): ['float32', 'float64'], op_type) if in_dygraph_mode(): - return paddle._C_ops.final_state_dirichlet(concentration) - elif _in_legacy_dygraph(): return paddle._C_ops.dirichlet(concentration) + elif _in_legacy_dygraph(): + return paddle._legacy_C_ops.dirichlet(concentration) else: helper = LayerHelper(op_type, **locals()) out = helper.create_variable_for_type_inference( diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py index 901f5e88e0c2f..937b0171722fd 100644 --- a/python/paddle/distribution/distribution.py +++ b/python/paddle/distribution/distribution.py @@ -26,7 +26,7 @@ import numpy as np import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid.data_feeder import (check_dtype, check_type, check_variable_and_dtype, convert_dtype) @@ -221,8 +221,8 @@ def 
_check_values_dtype_in_probs(self, param, value): warnings.warn( "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." ) - return _C_ops.cast(value, 'in_dtype', value.dtype, 'out_dtype', - param.dtype) + return _legacy_C_ops.cast(value, 'in_dtype', value.dtype, + 'out_dtype', param.dtype) return value check_variable_and_dtype(value, 'value', ['float32', 'float64'], diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index 71bc98a72de4b..f248e1a09273d 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -16,7 +16,7 @@ import warnings import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.distribution import distribution from paddle.fluid import core from paddle.fluid.data_feeder import (check_dtype, check_type, diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py index cbc83eba625cd..aa7f0bde4c830 100644 --- a/python/paddle/distribution/uniform.py +++ b/python/paddle/distribution/uniform.py @@ -16,7 +16,7 @@ import warnings import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.distribution import distribution from paddle.fluid import core from paddle.fluid.data_feeder import (check_dtype, check_type, @@ -191,10 +191,10 @@ def log_prob(self, value): lb_bool = self.low < value ub_bool = value < self.high - lb = _C_ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - value.dtype) - ub = _C_ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - value.dtype) + lb = _legacy_C_ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, + 'out_dtype', value.dtype) + ub = _legacy_C_ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, + 'out_dtype', value.dtype) return nn.log(lb * ub) - nn.log(self.high - self.low) name = self.name + '_log_prob' @@ -221,10 +221,10 @@ def probs(self, value): lb_bool = self.low < value ub_bool = value < self.high - lb = _C_ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - value.dtype) - ub = _C_ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - value.dtype) + lb = _legacy_C_ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, + 'out_dtype', value.dtype) + ub = _legacy_C_ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, + 'out_dtype', value.dtype) return (lb * ub) / (self.high - self.low) name = self.name + '_probs' diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 4b6a93edc447b..5cbc8f5e3beca 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -18,7 +18,7 @@ from .tensor.attribute import is_complex, is_floating_point, is_integer from .tensor.creation import _real_to_complex_dtype, _complex_to_real_dtype from .fluid.framework import _in_legacy_dygraph, in_dygraph_mode -from . import _C_ops +from . import _C_ops, _legacy_C_ops from .fluid.data_feeder import check_variable_and_dtype from .fluid.layer_helper import LayerHelper @@ -166,7 +166,7 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): by `axis` is used. axis (int, optional): Axis used to calculate FFT. If not specified, the last axis is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". 
Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies @@ -235,7 +235,7 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): by `axis` is used. axis (int, optional): Axis used to calculate FFT. If not specified, the last axis is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies @@ -303,6 +303,12 @@ def rfft(x, n=None, axis=-1, norm="backward", name=None): the forward/backward pair of transforms is scaled and with what normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". + + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; + - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; + - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + + Where ``n`` is the multiplication of each element in ``s`` . name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -310,11 +316,10 @@ def rfft(x, n=None, axis=-1, norm="backward", name=None): Returns: out(Tensor) : complex tensor - Raises: - - Examples: + .. code-block:: python + import paddle x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) @@ -348,7 +353,7 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): along the ` axis'. axis (int, optional): Axis used to calculate FFT. If not specified, the last axis is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". name (str, optional): The default value is None. Normally there is no need for user to set @@ -365,15 +370,13 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = np.array([1, -1j, -1]) - xp = paddle.to_tensor(x) - irfft_xp = paddle.fft.irfft(xp).numpy() - print(irfft_xp) - # [0. 1. 0. 0.] - + x = paddle.to_tensor([1, -1j, -1]) + irfft_x = paddle.fft.irfft(x) + print(irfft_x) + # Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0., 1., 0., 0.]) """ return fft_c2r(x, n, axis, norm, forward=False, name=name) @@ -392,7 +395,7 @@ def hfft(x, n=None, axis=-1, norm="backward", name=None): along the ` axis'. axis (int,optional): Axis used to calculate FFT. If not specified, the last axis is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". name (str, optional): The default value is None. 
Normally there is no need for user to set @@ -409,14 +412,13 @@ def hfft(x, n=None, axis=-1, norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = np.array([1, -1j, -1]) - xp = paddle.to_tensor(x) - hfft_xp = paddle.fft.hfft(xp).numpy() - print(hfft_xp) - # [0. 0. 0. 4.] + x = paddle.to_tensor([1, -1j, -1]) + hfft_x = paddle.fft.hfft(x) + print(hfft_x) + # Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0., 0., 0., 4.]) """ return fft_c2r(x, n, axis, norm, forward=True, name=name) @@ -456,7 +458,9 @@ def ihfft(x, n=None, axis=-1, norm="backward", name=None): out(Tensor) : complex tensor. Examples: + .. code-block:: python + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) @@ -490,7 +494,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies @@ -570,7 +574,7 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies @@ -587,18 +591,21 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = np.eye(3) - xp = paddle.to_tensor(x) - ifftn_xp = paddle.fft.ifftn(xp, axes=(1,)).numpy() - print(ifftn_xp) - - # [[ 0.33333333+0.j 0.33333333+0.j 0.33333333-0.j ] - # [ 0.33333333+0.j -0.16666667+0.28867513j -0.16666667-0.28867513j] - # [ 0.33333333+0.j -0.16666667-0.28867513j -0.16666667+0.28867513j]] - + x = paddle.eye(3) + ifftn_x = paddle.fft.ifftn(x, axes=(1,)) + print(ifftn_x) + # Tensor(shape=[3, 3], dtype=complex64, place=Place(cpu), stop_gradient=True, + # [[ (0.3333333432674408+0j) , + # (0.3333333432674408-0j) , + # (0.3333333432674408+0j) ], + # [ (0.3333333432674408+0j) , + # (-0.1666666716337204+0.28867512941360474j), + # (-0.1666666716337204-0.28867512941360474j)], + # [ (0.3333333432674408+0j) , + # (-0.1666666716337204-0.28867512941360474j), + # (-0.1666666716337204+0.28867512941360474j)]]) """ if is_integer(x) or is_floating_point(x): return fftn_r2c(x, @@ -630,20 +637,29 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): Args: x(Tensor) : Input tensor, taken to be real. - s(Sequence[int]) : Shape to use from the exec fft. The final element of + s(Sequence[int], optional) : Shape to use from the exec fft. 
The final element of `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if the given shape is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. - axes(Sequence[int]) : Axes over which to compute the FFT. If not given, + axes(Sequence[int], optional) : Axes over which to compute the FFT. If not given, the last ``len(s)`` axes are used, or all axes if `s` is also not specified. norm(str, optional) : Normalization mode, indicates which direction of the forward/backward pair of transforms is scaled and with what normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". + default value is "backward". The details of + the three modes are shown below: + + - "backward": The factors of the forward and backward directions are ``1`` + and ``1/n``, respectively; + - "forward": The factors of the forward and backward directions are ``1/n`` + and ``1``, respectively; + - "ortho": The factors of the forward and backward directions are both ``1/sqrt(n)``. + + Where ``n`` is the product of the elements of ``s``. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -651,12 +667,10 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): Returns: out(Tensor): complex tensor - - Raises: - ValueError: If `s` and `axes` have different length. - Examples: + .. code-block:: python + import paddle # default, all axis will be used to exec fft @@ -694,7 +708,7 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): Fourier Transform for real input over any number of axes in an M-D array by means of the Fast Fourier Transform (FFT). In other words, ``irfftn(rfftn(x), x.shape) == x`` to within numerical - accuracy. (The ``a.shape`` is necessary like ``len(a)`` is for `irfft`, + accuracy. (The ``x.shape`` is necessary like ``len(x)`` is for `irfft`, and for the same reason.) The input should be ordered in the same way as is returned by `rfftn`, @@ -704,19 +718,26 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the - number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used. Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where - ``k`` is the length of the input along that axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + + - `s` is also the number of input points used along this axis, except for the last axis, where ``s[-1]//2+1`` points of the input are used. + - Along any axis, if the shape indicated by `s` is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. + - If `s` is not given, the shape of the input along the axes specified by axes is used, except for the last axis, which is taken to be ``2*(k-1)``
Except for the last axis which is taken to be ``2*(k-1)`` + + where ``k`` is the length of the input along that axis. + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last `len(s)` axes are used, or all axes if `s` is also not specified. norm (str): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". + of "forward" or "backward" or "ortho". Default is "backward". The details of + three operations are shown below: + + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; + - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; + - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + + Where ``n`` is the multiplication of each element in ``s`` . name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -733,14 +754,17 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) - xp = paddle.to_tensor(x) - irfftn_xp = paddle.fft.irfftn(xp).numpy() - print(irfftn_xp) - # [ 2.25 -1.25 0.25 0.75] + x = paddle.to_tensor([2.+2.j, 2.+2.j, 3.+3.j]).astype(paddle.complex128) + print(x) + irfftn_x = paddle.fft.irfftn(x) + print(irfftn_x) + + # Tensor(shape=[3], dtype=complex128, place=Place(cpu), stop_gradient=True, + # [(2+2j), (2+2j), (3+3j)]) + # Tensor(shape=[4], dtype=float64, place=Place(cpu), stop_gradient=True, + # [ 2.25000000, -1.25000000, 0.25000000, 0.75000000]) """ return fftn_c2r(x, s, axes, norm, forward=False, name=name) @@ -770,7 +794,7 @@ def hfftn(x, s=None, axes=None, norm="backward", name=None): ``k`` is the length of the input along that axis. axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last `len(s)` axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". name (str, optional): The default value is None. Normally there is no need for user to set @@ -784,16 +808,13 @@ def hfftn(x, s=None, axes=None, norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) - xp = paddle.to_tensor(x) - hfftn_xp = paddle.fft.hfftn(xp).numpy() - print(hfftn_xp) - # [ 9. 3. 1. -5.] - - + x = paddle.to_tensor([(2+2j), (2+2j), (3+3j)]) + hfftn_x = paddle.fft.hfftn(x) + print(hfftn_x) + # Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [ 9., 3., 1., -5.]) """ return fftn_c2r(x, s, axes, norm, forward=True, name=name) @@ -814,7 +835,7 @@ def ihfftn(x, s=None, axes=None, norm="backward", name=None): of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. 
- axis(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not + axes(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not given, the last axis is used. norm(str, optional) : Normalization mode, indicates which direction of the forward/backward pair of transforms is scaled and with what @@ -828,7 +849,9 @@ def ihfftn(x, s=None, axes=None, norm="backward", name=None): out(Tensor) : complex tensor. Examples: + .. code-block:: python + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) @@ -838,7 +861,6 @@ def ihfftn(x, s=None, axes=None, norm="backward", name=None): print(paddle.fft.ihfft(spectrum)) # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) - """ return fftn_r2c(x, s, axes, norm, forward=False, onesided=True, name=name) @@ -863,7 +885,7 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): by `axes` is used. Default is None. axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a sequence of 2 integers. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". name (str, optional): The default value is None. Normally there is no need for user to set @@ -872,11 +894,6 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Returns: Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. Examples: @@ -934,7 +951,7 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): by `axes` is used. Default is None. axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a sequence of 2 integers. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". name (str, optional): The default value is None. Normally there is no need for user to set @@ -944,11 +961,6 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - Examples: .. code-block:: python @@ -986,12 +998,19 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x(Tensor): Input tensor, taken to be real. - s(Sequence[int]) : Shape of the FFT. + s(Sequence[int], optional) : Shape of the FFT. axes(Sequence[int], optional): Axes over which to compute the FFT. 
norm(str, optional) : {"backward", "ortho", "forward"}, default is "backward". Indicates which direction of the forward/backward pair of transforms is scaled and with what - normalization factor. + normalization factor. The details of + three operations are shown below: + + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; + - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; + - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + + Where ``n`` is the multiplication of each element in ``s`` . name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -999,12 +1018,10 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Returns: out(Tensor): The result of the real 2-D FFT. - Raises: - - Examples: .. code-block:: python + import paddle import numpy as np @@ -1040,34 +1057,34 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes must be two-dimensional. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform + norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". + of "forward" or "backward" or "ortho". Default is "backward". The details of + three operations are shown below: + + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; + - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; + - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + + Where ``n`` is the multiplication of each element in ``s`` . name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: Real tensor. The result of the inverse real 2-D FFT. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. Examples: .. code-block:: python - import numpy as np import paddle - x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) - xp = paddle.to_tensor(x) - irfft2_xp = paddle.fft.irfft2(xp).numpy() - print(irfft2_xp) - # [[ 2.375 -1.125 0.375 0.875] - # [ 0.125 0.125 0.125 0.125]] - + x = paddle.to_tensor([[3.+3.j, 2.+2.j, 3.+3.j], [2.+2.j, 2.+2.j, 3.+3.j]]) + irfft2_x = paddle.fft.irfft2(x) + print(irfft2_x) + # Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[ 2.37500000, -1.12500000, 0.37500000, 0.87500000], + # [ 0.12500000, 0.12500000, 0.12500000, 0.12500000]]) """ _check_at_least_ndim(x, 2) if s is not None: @@ -1101,26 +1118,18 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Returns: Real tensor. The real result of the 2-D Hermitian complex real FFT. - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. 
- ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - Examples: .. code-block:: python - import numpy as np import paddle - x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) - xp = paddle.to_tensor(x) - hfft2_xp = paddle.fft.hfft2(xp).numpy() - print(hfft2_xp) - # [[19. 7. 3. -9.] - # [ 1. 1. 1. 1.]] - - + x = paddle.to_tensor([[3.+3.j, 2.+2.j, 3.+3.j], [2.+2.j, 2.+2.j, 3.+3.j]]) + hfft2_x = paddle.fft.hfft2(x) + print(hfft2_x) + # Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[19., 7., 3., -9.], + # [ 1., 1., 1., 1.]]) """ _check_at_least_ndim(x, 2) if s is not None: @@ -1144,12 +1153,12 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): For more details see `ihfftn`. Args: - x(Tensor): Input tensor + x(Tensor): Input tensor. s(Sequence[int], optional): Shape of the real input to the inverse FFT. axes(Sequance[int], optional): The axes over which to compute the inverse fft. Default is the last two axes. norm(str, optional): {"backward", "ortho", "forward"}. Default is - "backward". + "backward". name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -1254,6 +1263,8 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): Args: n (int): Dimension inputed. d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. + dtype (str, optional): The data type of the returned tensor. Defaults to the data + type given by ``paddle.get_default_dtype()``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
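The normalization semantics documented repeatedly above can be checked numerically. A minimal sketch assuming a working Paddle install (printed formatting may differ by version and device):

```python
# Check the three normalization modes on a length-4 signal of ones:
# the DC bin should come out as 4 (backward), 2 (ortho), and 1 (forward).
import paddle

x = paddle.ones([4])
print(paddle.fft.fft(x, norm="backward")[0])  # (4+0j)  forward factor 1
print(paddle.fft.fft(x, norm="ortho")[0])     # (2+0j)  factor 1/sqrt(4)
print(paddle.fft.fft(x, norm="forward")[0])   # (1+0j)  factor 1/4
```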
@@ -1393,10 +1404,10 @@ def fft_c2c(x, n, axis, norm, forward, name): check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) if in_dygraph_mode(): - out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + out = _C_ops.fft_c2c(x, axes, norm, forward) elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) + out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { 'X': [x], @@ -1429,11 +1440,11 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) if in_dygraph_mode(): - out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) - out = getattr(_C_ops, op_type)(x, *attrs) + out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { 'X': [x], @@ -1475,16 +1486,16 @@ def fft_c2r(x, n, axis, norm, forward, name): if in_dygraph_mode(): if n is not None: - out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, n) + out = _C_ops.fft_c2r(x, axes, norm, forward, n) else: - out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + out = _C_ops.fft_c2r(x, axes, norm, forward, 0) elif _in_legacy_dygraph(): if n is not None: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', n) else: attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) + out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { 'X': [x], @@ -1538,10 +1549,10 @@ def fftn_c2c(x, s, axes, norm, forward, name): check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) if in_dygraph_mode(): - out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + out = _C_ops.fft_c2c(x, axes, norm, forward) elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) + out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { 'X': [x], @@ -1591,11 +1602,11 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) if in_dygraph_mode(): - out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) - out = getattr(_C_ops, op_type)(x, *attrs) + out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { 'X': [x], @@ -1657,16 +1668,16 @@ def fftn_c2r(x, s, axes, norm, forward, name): if in_dygraph_mode(): if s is not None: - out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, s[-1]) + out = _C_ops.fft_c2r(x, axes, norm, forward, s[-1]) else: - out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + out = _C_ops.fft_c2r(x, axes, norm, forward, 0) elif _in_legacy_dygraph(): if s: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', s[-1]) else: attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) + out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { 'X': [x], diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index b5575928c4faf..24f69a86662d6 100644 --- a/python/paddle/fluid/clip.py +++ 
b/python/paddle/fluid/clip.py @@ -29,7 +29,7 @@ from .framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from .layer_helper import LayerHelper from .framework import default_main_program -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', @@ -71,9 +71,9 @@ def _squared_l2_norm(x): return sum_square if in_dygraph_mode(): - return _C_ops.final_state_squared_l2_norm(x) - elif _in_legacy_dygraph(): return _C_ops.squared_l2_norm(x) + elif _in_legacy_dygraph(): + return _legacy_C_ops.squared_l2_norm(x) op_type = 'squared_l2_norm' check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 44b622807bcc7..90bf501ed5c17 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -36,7 +36,7 @@ from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_ from paddle.fluid.layers import slice, reshape import warnings -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'fused_elemwise_activation', 'sequence_topk_avg_pooling', 'var_conv_2d', @@ -1783,7 +1783,7 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): """ if paddle.fluid._non_static_mode(): attrs = ('has_offset', has_offset) - return getattr(_C_ops, "bilateral_slice")(x, grid, guide, *attrs) + return getattr(_legacy_C_ops, "bilateral_slice")(x, grid, guide, *attrs) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'bilateral_slice') check_variable_and_dtype(guide, 'guide', ['float32', 'float64'], @@ -1858,7 +1858,7 @@ def correlation(x, attrs = ("pad_size", pad_size, "kernel_size", kernel_size, "max_displacement", max_displacement, "stride1", stride1, "stride2", stride2, "corr_type_multiply", corr_type_multiply) - output = getattr(_C_ops, "correlation")(x, y, *attrs) + output = getattr(_legacy_C_ops, "correlation")(x, y, *attrs) else: helper = LayerHelper("correlation", **locals()) output = helper.create_variable_for_type_inference(dtype=x.dtype) diff --git a/python/paddle/fluid/contrib/optimizer.py b/python/paddle/fluid/contrib/optimizer.py index 9265198485c78..67e972cf3e231 100644 --- a/python/paddle/fluid/contrib/optimizer.py +++ b/python/paddle/fluid/contrib/optimizer.py @@ -21,7 +21,7 @@ from paddle.fluid import layers from paddle.fluid.layer_helper import LayerHelper import warnings -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = ['Momentum'] @@ -207,7 +207,7 @@ def _append_optimize_op(self, block, param_and_grad): if find_master else None) if framework._non_static_mode(): - _, _, _ = _C_ops.momentum( + _, _, _ = _legacy_C_ops.momentum( param_and_grad[0], param_and_grad[1], velocity_acc, lr, master_weight, param_and_grad[0], velocity_acc, master_weight, 'mu', self._momentum, 'use_nesterov', self._use_nesterov, diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 7c48e29ebc94b..668aa12210cea 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -37,8 +37,9 @@ from . 
import utils __all__ = [ - 'PostTrainingQuantization', 'WeightQuantization', - 'PostTrainingQuantizationProgram' + 'PostTrainingQuantization', + 'WeightQuantization', + 'PostTrainingQuantizationProgram', ] _logger = get_logger(__name__, @@ -325,6 +326,7 @@ def __init__(self, self._activation_quantize_type = activation_quantize_type self._weight_quantize_type = weight_quantize_type self._onnx_format = onnx_format + self._clip_extra = True if self._onnx_format else False self._skip_tensor_list = skip_tensor_list self._is_full_quantize = is_full_quantize if is_full_quantize: @@ -504,7 +506,6 @@ def save_quantized_model(self, Returns: None ''' - clip_extra = True if self._onnx_format else False io.save_inference_model(dirname=save_model_path, model_filename=model_filename, params_filename=params_filename, @@ -512,7 +513,7 @@ def save_quantized_model(self, target_vars=self._fetch_list, executor=self._executor, main_program=self._program, - clip_extra=clip_extra) + clip_extra=self._clip_extra) _logger.info("The quantized model is saved in " + save_model_path) def _load_model_data(self): @@ -534,6 +535,8 @@ def _load_model_data(self): for var_name in self._feed_list] if self._data_loader is not None: + self._batch_nums = self._batch_nums if self._batch_nums else len( + self._data_loader) return self._data_loader = io.DataLoader.from_generator(feed_list=feed_vars, capacity=3 * @@ -547,6 +550,8 @@ def _load_model_data(self): elif self._batch_generator is not None: self._data_loader.set_batch_generator(self._batch_generator, places=self._place) + self._batch_nums = self._batch_nums if self._batch_nums else len( + list(self._data_loader)) def _optimize_fp32_model(self): ''' @@ -630,7 +635,6 @@ def _reset_activation_persistable(self): if var.name in self._quantized_act_var_name: var.persistable = False to_erase.append(var.name) - self._scope.erase(to_erase) def _sampling(self): ''' diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py index 6100ed4f82a0e..575d0826b27c3 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py @@ -191,6 +191,8 @@ def generate_quantized_model(self, onnx_format=onnx_format, is_use_cache_file=is_use_cache_file) ptq.quantize() + if onnx_format: + ptq._clip_extra = False ptq.save_quantized_model(self.int8_model_path) def run_test(self, @@ -226,7 +228,7 @@ def run_test(self, self.generate_quantized_model(fp32_model_path, data_path, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, quant_iterations, + is_optimize_model, 10, quant_iterations, onnx_format) print("Start INT8 inference for {0} on {1} samples ...".format( diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index cb6d685f721d6..fc675ed4a07d8 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -246,6 +246,7 @@ def generate_quantized_model(self, is_full_quantize=False, is_use_cache_file=False, is_optimize_model=False, + batch_nums=10, onnx_format=False): try: os.system("mkdir " + self.int8_model) @@ -263,6 +264,7 @@ def 
generate_quantized_model(self, sample_generator=val_reader, model_dir=model_path, algo=algo, + batch_nums=batch_nums, quantizable_op_type=quantizable_op_type, round_type=round_type, is_full_quantize=is_full_quantize, @@ -283,7 +285,8 @@ def run_test(self, is_use_cache_file, is_optimize_model, diff_threshold, - onnx_format=False): + onnx_format=False, + batch_nums=10): infer_iterations = self.infer_iterations batch_size = self.batch_size sample_iterations = self.sample_iterations @@ -301,7 +304,8 @@ def run_test(self, self.generate_quantized_model(os.path.join(model_cache_folder, "model"), quantizable_op_type, algo, round_type, is_full_quantize, is_use_cache_file, - is_optimize_model, onnx_format) + is_optimize_model, batch_nums, + onnx_format) print("Start INT8 inference for {0} on {1} images ...".format( model, infer_iterations * batch_size)) @@ -392,9 +396,18 @@ def test_post_training_hist_mobilenetv1(self): is_use_cache_file = False is_optimize_model = True diff_threshold = 0.03 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) + batch_nums = 3 + self.run_test(model, + algo, + round_type, + data_urls, + data_md5s, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_nums=batch_nums) class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization): @@ -441,6 +454,7 @@ def test_post_training_onnx_format_mobilenetv1(self): is_optimize_model = True onnx_format = True diff_threshold = 0.05 + batch_nums = 3 self.run_test(model, algo, round_type, @@ -451,7 +465,8 @@ def test_post_training_onnx_format_mobilenetv1(self): is_use_cache_file, is_optimize_model, diff_threshold, - onnx_format=onnx_format) + onnx_format=onnx_format, + batch_nums=batch_nums) if __name__ == '__main__': diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index 1c4728b422362..bd3bb87a79fdb 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -411,7 +411,7 @@ class Subset(Dataset): indices (sequence): Indices in the whole set selected for subset. Returns: - Dataset: A Dataset which is the subset of the original dataset. + List[Dataset]: A Dataset which is the subset of the original dataset. 
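The post-training-quantization hunks above thread a `batch_nums` argument through `generate_quantized_model` and `run_test`; together with the `_load_model_data` change earlier, an unset `batch_nums` now falls back to the length of the calibration loader. A toy model of that fallback (`resolve_batch_nums` is a hypothetical helper for illustration; the real code assigns `self._batch_nums` inline, and the sketch assumes the loader supports `len()`):

```python
# Toy model of the batch_nums fallback added to _load_model_data above.
def resolve_batch_nums(batch_nums, data_loader):
    # An explicit user setting wins; otherwise calibrate over every batch
    # the loader can yield.
    return batch_nums if batch_nums else len(data_loader)

print(resolve_batch_nums(10, range(100)))    # 10  (explicit value kept)
print(resolve_batch_nums(None, range(100)))  # 100 (falls back to loader size)
```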
Examples: diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 2ce4508647451..aeb4c730975a6 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -20,7 +20,7 @@ from ...wrapped_decorator import signature_safe_contextmanager, wrap_decorator import warnings import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from collections import defaultdict from enum import Enum @@ -286,26 +286,26 @@ def _unscale(self, optimizer): if param.dtype == core.VarDesc.VarType.FP32 ] if core.is_compiled_with_npu(): - float_status = _C_ops.alloc_float_status() - _C_ops.clear_float_status(float_status, float_status) + float_status = _legacy_C_ops.alloc_float_status() + _legacy_C_ops.clear_float_status(float_status, float_status) if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - float_status, param_grads_fp16, - self._temp_found_inf_fp16) + _legacy_C_ops.check_finite_and_unscale( + param_grads_fp16, self._scale, float_status, + param_grads_fp16, self._temp_found_inf_fp16) if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - float_status, param_grads_fp32, - self._temp_found_inf_fp32) + _legacy_C_ops.check_finite_and_unscale( + param_grads_fp32, self._scale, float_status, + param_grads_fp32, self._temp_found_inf_fp32) else: if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - self._temp_found_inf_fp16) + _legacy_C_ops.check_finite_and_unscale( + param_grads_fp16, self._scale, param_grads_fp16, + self._temp_found_inf_fp16) if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - self._temp_found_inf_fp32) + _legacy_C_ops.check_finite_and_unscale( + param_grads_fp32, self._scale, param_grads_fp32, + self._temp_found_inf_fp32) self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 23744db61a11d..db778a39ecfdf 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -30,7 +30,7 @@ from paddle.fluid.contrib.mixed_precision.fp16_utils import rewrite_program, cast_model_to_fp16 from paddle.fluid.dygraph.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard import paddle.compat as cpt -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops class NestSequence(object): @@ -442,10 +442,11 @@ def __call__(self, inputs): self._cast_fp16_if_pure_fp16(in_vars) - _C_ops.run_program(self._valid_vars(in_vars), - self._valid_vars(self._params), - self._valid_vars(out_vars), self._create_scope_vec(), - self._double_grads, self._cuda_graph_vec, *attrs) + _legacy_C_ops.run_program(self._valid_vars(in_vars), + self._valid_vars(self._params), + self._valid_vars(out_vars), + self._create_scope_vec(), self._double_grads, + self._cuda_graph_vec, *attrs) restored_nest_out = self._restore_out(out_vars) return self._remove_no_value(restored_nest_out) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index d3db7209c659d..09a249ed6b10f 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ 
b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -148,7 +148,12 @@ def create_undefined_var_like(variable): from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM var = data_layer_not_check(unique_name.generate("undefined_var"), variable.shape, variable.dtype) + var.stop_gradient = False + helper = LayerHelper('create_undefined_var_like', **locals()) + saved_block_ids = helper.main_program.current_block_idx + helper.main_program.current_block_idx = 0 assign(RETURN_NO_VALUE_MAGIC_NUM, var) + helper.main_program.current_block_idx = saved_block_ids return var @@ -156,6 +161,7 @@ def create_undefined_variable(): from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM var = data_layer_not_check(unique_name.generate("undefined_var"), [1], "float64") + var.stop_gradient = False # the variable is created in block(0), we append assign in block(0) either. helper = LayerHelper('create_undefined_variable', **locals()) saved_block_ids = helper.main_program.current_block_idx diff --git a/python/paddle/fluid/dygraph/inplace_utils.py b/python/paddle/fluid/dygraph/inplace_utils.py index 14e875b8b06c4..968a957b660d3 100644 --- a/python/paddle/fluid/dygraph/inplace_utils.py +++ b/python/paddle/fluid/dygraph/inplace_utils.py @@ -16,7 +16,7 @@ from ..framework import _non_static_mode import warnings import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops # NOTE(pangyoki): The Inplace APIs with underline(`_`) is only valid for the method of calling `_C_ops` diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 7f91a15ff0149..f470efa12f1a8 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -30,7 +30,7 @@ from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import _non_static_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = ['TranslatedLayer'] @@ -865,9 +865,10 @@ def _run_dygraph(instance, input, program_holder): attrs = ('global_block', trace_program.block(0), 'start_op_index', 0, 'end_op_index', end_op_index, 'is_test', instance._is_test, 'program_id', _hash_with_id(trace_program, instance)) - _C_ops.run_program(_valid_vars(input_vars), _valid_vars(persistable_vars), - _valid_vars(output_vars), tmp_scope_vec, - _valid_vars(double_grad_vars), None, *attrs) + _legacy_C_ops.run_program(_valid_vars(input_vars), + _valid_vars(persistable_vars), + _valid_vars(output_vars), tmp_scope_vec, + _valid_vars(double_grad_vars), None, *attrs) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient # will be SelectedRows, not LoDTensor. 
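The dygraph_to_static/utils.py hunks above make the undefined-variable placeholders trainable (`stop_gradient = False`) and append the magic-value assign to block(0), where the placeholder itself lives, by temporarily switching the program's current block index. The shape of that pattern, sketched in isolation (`program` and `do_assign` are hypothetical stand-ins for `helper.main_program` and the `assign(...)` call; the try/finally guard is added here for illustration and is not in the original):

```python
# Sketch of the save/switch/restore pattern used above when assigning the
# RETURN_NO_VALUE magic number into an undefined-variable placeholder.
def assign_in_block0(program, do_assign):
    saved_block_idx = program.current_block_idx
    program.current_block_idx = 0  # placeholder lives in block(0), so the
    try:                           # assign op must be appended there too
        do_assign()
    finally:
        program.current_block_idx = saved_block_idx  # restore caller's block
```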
But tracer will just diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 1f6ce480887c4..b66d1ee302231 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -22,7 +22,7 @@ import numpy as np import warnings -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops _supported_int_dtype_ = [ core.VarDesc.VarType.UINT8, @@ -72,12 +72,13 @@ def monkey_patch_math_varbase(): @no_grad def create_tensor(value, dtype, shape): if framework._in_eager_mode_: - out = _C_ops.final_state_full(shape, value, dtype, - framework._current_expected_place()) + out = _C_ops.full(shape, value, dtype, + framework._current_expected_place()) else: out = _varbase_creator(dtype=dtype) - out = _C_ops.fill_constant(out, 'dtype', dtype, 'shape', shape, - 'value', value, 'force_cpu', False) + out = _legacy_C_ops.fill_constant(out, 'dtype', dtype, 'shape', + shape, 'value', value, + 'force_cpu', False) out.stop_gradient = True return out @@ -111,13 +112,14 @@ def astype(self, dtype): dtype = convert_np_dtype_to_dtype_(dtype) if _in_legacy_dygraph(): - return _C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', dtype) - return _C_ops.final_state_cast(self, dtype) + return _legacy_C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', + dtype) + return _C_ops.cast(self, dtype) def _scalar_elementwise_op_(var, scale, bias): if framework.in_dygraph_mode(): - return _C_ops.final_state_scale(var, float(scale), bias, True) - return _C_ops.scale(var, 'scale', scale, 'bias', bias) + return _C_ops.scale(var, float(scale), bias, True) + return _legacy_C_ops.scale(var, 'scale', scale, 'bias', bias) def _neg_(var): return _scalar_elementwise_op_(var, -1.0, 0.0) @@ -174,9 +176,9 @@ def _T_(var): for i in range(len(var.shape)): perm.insert(0, i) if _in_legacy_dygraph(): - out, _ = _C_ops.transpose2(var, 'axis', perm) + out, _ = _legacy_C_ops.transpose2(var, 'axis', perm) else: - out = _C_ops.final_state_transpose(var, perm) + out = _C_ops.transpose(var, perm) return out def _scalar_add_(var, value): @@ -223,8 +225,7 @@ def __impl__(self, other_var): # so the calculation result here and the calculation result of numpy are # different after 6 decimal point. If necessary, we can also use float64 here. # torch's behavior here is consistent with ours - if (op_type == "final_state_divide" - or op_type == "elementwise_div" + if (op_type == "divide" or op_type == "elementwise_div" ) and self.dtype in _supported_int_dtype_: self = astype(self, 'float32') # here use `scale` replace `elementwise` to get better performance @@ -281,17 +282,25 @@ def __impl__(self, other_var): self = other_var other_var = tmp - if (op_type == "final_state_divide" or op_type == "elementwise_div" + if (op_type == "divide" or op_type == "elementwise_div" ) and self.dtype in _supported_int_dtype_: self = astype(self, 'float32') other_var = astype(other_var, 'float32') # 4. 
calculation axis = -1 - math_op = getattr(_C_ops, op_type) + if in_dygraph_mode(): + math_op = getattr(_C_ops, op_type) + else: + math_op = getattr(_legacy_C_ops, op_type) if call_final_api: - if op_type == "final_state_matmul": + if op_type == "matmul": return math_op(self, other_var, False, False) + if op_type == "pow": + if isinstance(other_var, core.eager.Tensor): + return _C_ops.elementwise_pow(self, other_var) + else: + return _C_ops.elementwise_pow(self, other_var) return math_op(self, other_var, -1) return math_op(self, other_var, 'axis', axis) @@ -324,104 +333,95 @@ def __impl__(self, other_var): ('ndim', _ndim_), ('size', _size_), ('T', _T_), - ('__add__', - _binary_creator_('__add__', 'final_state_add', False, _scalar_add_, - True)) if framework._in_eager_mode_ else + ('__add__', _binary_creator_('__add__', 'add', False, _scalar_add_, + True)) if framework._in_eager_mode_ else ('__add__', _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)), ## a+b == b+a. Do not need to reverse explicitly ('__radd__', - _binary_creator_('__radd__', 'final_state_add', False, _scalar_add_, - True)) if framework._in_eager_mode_ else + _binary_creator_('__radd__', 'add', False, _scalar_add_, True)) + if framework._in_eager_mode_ else ('__radd__', _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)), ('__sub__', - _binary_creator_('__sub__', 'final_state_subtract', False, - _scalar_sub_, True)) if framework._in_eager_mode_ else + _binary_creator_('__sub__', 'subtract', False, _scalar_sub_, True)) + if framework._in_eager_mode_ else ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False, _scalar_sub_)), ('__rsub__', - _binary_creator_('__rsub__', 'final_state_subtract', True, - _scalar_rsub_, True)) + _binary_creator_('__rsub__', 'subtract', True, _scalar_rsub_, True)) if framework._in_eager_mode_ else ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True, _scalar_rsub_)), ('__mul__', - _binary_creator_('__mul__', 'final_state_multiply', False, - _scalar_mul_, True)) if framework._in_eager_mode_ else + _binary_creator_('__mul__', 'multiply', False, _scalar_mul_, True)) + if framework._in_eager_mode_ else ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False, _scalar_mul_)), ## a*b == b*a. 
Do not need to reverse explicitly ('__rmul__', - _binary_creator_('__rmul__', 'final_state_multiply', False, - _scalar_mul_, True)) if framework._in_eager_mode_ else + _binary_creator_('__rmul__', 'multiply', False, _scalar_mul_, True)) + if framework._in_eager_mode_ else ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), ('__div__', - _binary_creator_('__div__', 'final_state_divide', False, _scalar_div_, - True)) if framework._in_eager_mode_ else + _binary_creator_('__div__', 'divide', False, _scalar_div_, True)) + if framework._in_eager_mode_ else ('__div__', _binary_creator_('__div__', 'elementwise_div', False, _scalar_div_)), ('__truediv__', - _binary_creator_('__truediv__', 'final_state_divide', False, - _scalar_div_, True)) if framework._in_eager_mode_ else + _binary_creator_('__truediv__', 'divide', False, _scalar_div_, True)) + if framework._in_eager_mode_ else ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', False, _scalar_div_)), - ('__rdiv__', - _binary_creator_('__rdiv__', 'final_state_divide', True, None, True)) + ('__rdiv__', _binary_creator_('__rdiv__', 'divide', True, None, True)) if framework._in_eager_mode_ else ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, None)), ('__rtruediv__', - _binary_creator_('rtruediv__', 'final_state_divide', True, None, True)) + _binary_creator_('rtruediv__', 'divide', True, None, True)) if framework._in_eager_mode_ else ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, None)), - ('__pow__', - _binary_creator_('__pow__', 'final_state_elementwise_pow', False, - _C_ops.final_state_pow, True)) + ('__pow__', _binary_creator_('__pow__', 'pow', False, _C_ops.pow, True)) if framework._in_eager_mode_ else ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, None)), + ('__floordiv__', + _binary_creator_('__floordiv__', 'floor_divide', False, None, True)) + if framework._in_eager_mode_ else ('__floordiv__', _binary_creator_('__floordiv__', 'elementwise_floordiv', False, None)), - ('__mod__', - _binary_creator_('__mod__', 'final_state_modulo', False, None, True)) + ('__mod__', _binary_creator_('__mod__', 'modulo', False, None, True)) if framework._in_eager_mode_ else ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False, None)), ('__matmul__', - _binary_creator_('__matmul__', "final_state_matmul", False, None, - True)) if framework._in_eager_mode_ else + _binary_creator_('__matmul__', "matmul", False, None, True)) + if framework._in_eager_mode_ else ('__matmul__', _binary_creator_('__matmul__', "matmul_v2", False, None)), ## for logical compare - ('__eq__', - _binary_creator_('__eq__', 'final_state_equal', False, None, True)) + ('__eq__', _binary_creator_('__eq__', 'equal', False, None, True)) if framework._in_eager_mode_ else ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), - ('__ne__', - _binary_creator_('__ne__', 'final_state_not_equal', False, None, True)) + ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None, True)) if framework._in_eager_mode_ else ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), - ('__lt__', - _binary_creator_('__lt__', 'final_state_less_than', False, None, True)) + ('__lt__', _binary_creator_('__lt__', 'less_than', False, None, True)) if framework._in_eager_mode_ else ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)), - ('__le__', - _binary_creator_('__le__', 'final_state_less_equal', 
False, None, - True)) if framework._in_eager_mode_ else + ('__le__', _binary_creator_('__le__', 'less_equal', False, None, True)) + if framework._in_eager_mode_ else ('__le__', _binary_creator_('__le__', 'less_equal', False, None)), - ('__gt__', - _binary_creator_('__gt__', 'final_state_greater_than', False, None, - True)) if framework._in_eager_mode_ else + ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None, + True)) if framework._in_eager_mode_ else ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)), - ('__ge__', - _binary_creator_('__ge__', 'final_state_greater_equal', False, None, - True)) if framework._in_eager_mode_ else + ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None, + True)) if framework._in_eager_mode_ else ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), ('__array_ufunc__', None) ] diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 0f250fbd87091..a21edd8cb5744 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -33,7 +33,7 @@ import logging import os import paddle.utils.deprecated as deprecated -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding', @@ -240,10 +240,10 @@ def _get_default_param_initializer(): def forward(self, input): if in_dygraph_mode() and self._l_type == "conv2d": - pre_bias = _C_ops.final_state_conv2d( - input, self.weight, self._stride, self._padding, "EXPLICIT", - self._groups if self._groups else 1, self._dilation, "NCHW", - False, -1, False) + pre_bias = _C_ops.conv2d(input, self.weight, self._stride, + self._padding, "EXPLICIT", + self._groups if self._groups else 1, + self._dilation, "NCHW", False, -1, False) if self.bias is not None: pre_act = F.elementwise_add(pre_bias, self.bias, axis=1) else: @@ -257,7 +257,7 @@ def forward(self, input): 'dilations', self._dilation, 'groups', self._groups if self._groups else 1, 'use_cudnn', self._use_cudnn, 'use_mkldnn', self._use_mkldnn) - out = _C_ops.conv2d(input, self.weight, *attrs) + out = _legacy_C_ops.conv2d(input, self.weight, *attrs) pre_bias = out pre_act = dygraph_utils._append_bias_in_dygraph( @@ -892,7 +892,7 @@ def forward(self, input): 'use_cudnn', self._use_cudnn, 'ceil_mode', self._ceil_mode, 'use_mkldnn', self._use_mkldnn, 'exclusive', self._exclusive, 'data_format', self._data_format) - return _C_ops.pool2d(input, *attrs) + return _legacy_C_ops.pool2d(input, *attrs) check_variable_and_dtype( input, 'input', ['int8', 'uint8', 'float16', 'float32', 'float64'], @@ -997,9 +997,9 @@ def __init__(self, def forward(self, input): if _non_static_mode(): pre_bias = _varbase_creator(dtype=input.dtype) - _C_ops.matmul(input, self.weight, pre_bias, 'transpose_X', False, - 'transpose_Y', False, "alpha", 1, "use_mkldnn", - self._use_mkldnn) + _legacy_C_ops.matmul(input, self.weight, pre_bias, 'transpose_X', + False, 'transpose_Y', False, "alpha", 1, + "use_mkldnn", self._use_mkldnn) pre_act = dygraph_utils._append_bias_in_dygraph( pre_bias, self.bias, @@ -1144,12 +1144,13 @@ def __init__(self, def forward(self, input): if in_dygraph_mode(): - out = _C_ops.final_state_instance_norm(input, self.scale, self.bias, - self._epsilon) + out = _C_ops.instance_norm(input, self.scale, self.bias, + self._epsilon) return out if _in_legacy_dygraph(): - out, _, _ = _C_ops.instance_norm(input, self.scale, self.bias, - 'epsilon', self._epsilon) + out, _, _ = 
_legacy_C_ops.instance_norm(input, self.scale, + self.bias, 'epsilon', + self._epsilon) return out check_variable_and_dtype(input, 'input', ['float32', 'float64'], @@ -1360,7 +1361,7 @@ def forward(self, input): if _non_static_mode(): if in_dygraph_mode(): - batch_norm_out, t1, t2, t3, t4, _ = _C_ops.final_state_batch_norm( + batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm( input, self.weight, self.bias, self._mean, self._variance, self._momentum, self._epsilon, self._data_layout, not self.training, self._use_global_stats, @@ -1375,7 +1376,7 @@ def forward(self, input): "fuse_with_relu", self._fuse_with_relu, "use_global_stats", self._use_global_stats, 'trainable_statistics', self._trainable_statistics) - batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( + batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( input, self.weight, self.bias, self._mean, self._variance, None, mean_out, variance_out, *attrs) @@ -1529,7 +1530,7 @@ def forward(self, input): if _non_static_mode(): attrs = sum(attrs.items(), ()) - out, mask = _C_ops.dropout(input, *attrs) + out, mask = _legacy_C_ops.dropout(input, *attrs) return out out = self._helper.create_variable_for_type_inference(dtype=input.dtype) @@ -1681,12 +1682,10 @@ def __init__(self, def forward(self, input): if _non_static_mode(): - return _C_ops.lookup_table_v2(self.weight, input, 'is_sparse', - self._is_sparse, 'is_distributed', - self._is_distributed, - 'remote_prefetch', - self._remote_prefetch, 'padding_idx', - self._padding_idx) + return _legacy_C_ops.lookup_table_v2( + self.weight, input, 'is_sparse', self._is_sparse, + 'is_distributed', self._is_distributed, 'remote_prefetch', + self._remote_prefetch, 'padding_idx', self._padding_idx) check_variable_and_dtype(input, 'input', ['uint8', 'int8', 'int16', 'int32', 'int64'], @@ -1841,16 +1840,15 @@ def forward(self, input): if _non_static_mode(): if in_dygraph_mode(): - pre_act, _, _, = _C_ops.final_state_layer_norm( - input, self.weight, self.bias, self._epsilon, - self._begin_norm_axis, False) + pre_act, _, _, = _C_ops.layer_norm(input, self.weight, + self.bias, self._epsilon, + self._begin_norm_axis, False) return dygraph_utils._append_activation_in_dygraph( pre_act, act=self._act) else: - pre_act, _, _ = _C_ops.layer_norm(input, self.weight, self.bias, - 'epsilon', self._epsilon, - 'begin_norm_axis', - self._begin_norm_axis) + pre_act, _, _ = _legacy_C_ops.layer_norm( + input, self.weight, self.bias, 'epsilon', self._epsilon, + 'begin_norm_axis', self._begin_norm_axis) return dygraph_utils._append_activation_in_dygraph( pre_act, act=self._act) @@ -2036,7 +2034,7 @@ def __init__(self, def forward(self, input, hidden): if _non_static_mode(): - gate, reset_hidden_pre, updated_hidden = _C_ops.gru_unit( + gate, reset_hidden_pre, updated_hidden = _legacy_C_ops.gru_unit( input, hidden, self.weight, self.bias, 'activation', self.activation, 'gate_activation', self.gate_activation) return updated_hidden, reset_hidden_pre, gate @@ -2286,12 +2284,12 @@ def forward(self, input, label, sample_weight=None): self._attrs['seed'], 'sampler', self._attrs['sampler'], 'is_sparse', self._attrs['is_sparse'], 'remote_prefetch', self._attrs['remote_prefetch']) - cost, _, _ = _C_ops.nce(input, label, self.weight, self.bias, - self._inputs['SampleWeight'], - self._inputs['CustomDistProbs'], - self._inputs['CustomDistAlias'], - self._inputs['CustomDistAliasProbs'], - *attrs) + cost, _, _ = _legacy_C_ops.nce(input, label, self.weight, self.bias, + self._inputs['SampleWeight'], + 
self._inputs['CustomDistProbs'], + self._inputs['CustomDistAlias'], + self._inputs['CustomDistAliasProbs'], + *attrs) return cost / (self._num_neg_samples + 1) check_variable_and_dtype(input, "input", ['float32', 'float64'], "NCE") @@ -2731,7 +2729,7 @@ def __init__(self, def forward(self, input): if _non_static_mode(): - op = getattr(_C_ops, self._op_type) + op = getattr(_legacy_C_ops, self._op_type) out = op(input, self.weight, 'output_size', self._output_size, 'strides', self._stride, 'paddings', self._padding, 'dilations', self._dilation, 'groups', self._groups, @@ -3032,16 +3030,15 @@ def forward(self, input): variance_out = self._helper.create_variable_for_type_inference( dtype=self._dtype, stop_gradient=True) if in_dygraph_mode(): - out = _C_ops.final_state_group_norm(input, self.weight, self.bias, - self._epsilon, self._groups, - "NCHW") + out = _C_ops.group_norm(input, self.weight, self.bias, + self._epsilon, self._groups, "NCHW") return dygraph_utils._append_activation_in_dygraph(out, self._act) elif _in_legacy_dygraph(): attrs = ('epsilon', self._epsilon, 'groups', self._groups) - out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, - mean_out, variance_out, *attrs) + out, _, _ = _legacy_C_ops.group_norm(input, self.weight, self.bias, + mean_out, variance_out, *attrs) return dygraph_utils._append_activation_in_dygraph(out, self._act) else: diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 52b25debaca83..91f22842a4561 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -22,7 +22,7 @@ from contextlib import contextmanager import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.dygraph import layers @@ -346,7 +346,7 @@ def _split_tensors(coalesced_grads_and_grad_vars): attrs = () attrs += ('sections', grad_var_len) attrs += ('axis', 0) - _C_ops.split(coalesced_grad, origin_grad_vars, *attrs) + _legacy_C_ops.split(coalesced_grad, origin_grad_vars, *attrs) for g_var, g_shape in zip(origin_grad_vars, grad_shapes): g_var.reshape_(shape=g_shape) assert g_var.shape == g_shape diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 046a98293e832..4627d6d11e74c 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -19,11 +19,11 @@ from collections import defaultdict from paddle.fluid import core from paddle.fluid import framework -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops -final_state_name_mapping = { +name_mapping = { "graph_send_recv": { - "final_op_name": "final_state_graph_send_recv", + "final_op_name": "graph_send_recv", "x": "X", "src_index": "Src_index", "dst_index": "Dst_index", @@ -31,7 +31,7 @@ "dst_count": "Dst_count" }, "matmul_v2": { - "final_op_name": "final_state_matmul", + "final_op_name": "matmul", "transpose_x": "trans_x", "transpose_y": "trans_y", "x": "X", @@ -39,33 +39,33 @@ "out": "Out", }, # "elementwise_add": { - # "final_op_name": "final_state_add", + # "final_op_name": "add", # "x": "X", # "y": "Y", # }, "trunc": { - "final_op_name": "final_state_trunc", + "final_op_name": "trunc", "x": "X", "out": "Out", }, # "pool2d": { - # "final_op_name": "final_state_pool2d", + # "final_op_name": "pool2d", # "x": "X", # "kernel_size": "ksize", # "out": "Out", # }, "abs": { - "final_op_name": "final_state_abs", + "final_op_name": "abs", "x": "X", "out": 
"Out", }, "digamma": { - "final_op_name": "final_state_digamma", + "final_op_name": "digamma", "x": "X", "out": "Out", }, "diagonal": { - "final_op_name": "final_state_diagonal", + "final_op_name": "diagonal", "x": "Input", "offset": "offset", "axis1": "axis1", @@ -73,7 +73,7 @@ "out": "Out", }, "roi_align": { - "final_op_name": "final_state_roi_align", + "final_op_name": "roi_align", "x": "X", "boxes": "ROIs", "boxes_num": "RoisNum", @@ -84,7 +84,7 @@ "aligned": "aligned", }, # "one_hot": { - # "final_op_name": "final_state_one_hot", + # "final_op_name": "one_hot", # "x": "X", # "num_class": "depth", # "out": "Out", @@ -110,22 +110,22 @@ def __init__(self): self._train_mode = True - def eager_trace_op(self, - type, - inputs, - outputs, - attrs, - stop_gradient=False, - inplace_map=None): - function_ptr = _C_ops.__dict__[type] + def eager_legacy_trace_op(self, + op_type, + inputs, + outputs, + attrs, + stop_gradient=False, + inplace_map=None): + function_ptr = _legacy_C_ops.__dict__[op_type] - core_ops_args_info = _C_ops.get_core_ops_args_info() - core_ops_args_type_info = _C_ops.get_core_ops_args_type_info() - core_ops_returns_info = _C_ops.get_core_ops_returns_info() + core_ops_args_info = _legacy_C_ops.get_core_ops_args_info() + core_ops_args_type_info = _legacy_C_ops.get_core_ops_args_type_info() + core_ops_returns_info = _legacy_C_ops.get_core_ops_returns_info() - op_args = core_ops_args_info[type] - op_args_type = core_ops_args_type_info[type] - op_returns = core_ops_returns_info[type] + op_args = core_ops_args_info[op_type] + op_args_type = core_ops_args_type_info[op_type] + op_returns = core_ops_returns_info[op_type] arg_list = [] for i in range(len(op_args)): @@ -175,7 +175,7 @@ def eager_trace_op(self, attrs_list.append(v) returns = function_ptr(*arg_list, *attrs_list) - if type == 'load_combine': + if op_type == 'load_combine': assert len(outputs.keys()) == 1 key = list(outputs.keys())[0] for j in range(len(returns)): @@ -211,34 +211,33 @@ def eager_trace_op(self, else: outputs[key].reconstruct_from_(returns, False) - def eager_final_state_trace_op(self, - type, - inputs, - outputs, - attrs, - stop_gradient=False, - inplace_map=None): - assert type in final_state_name_mapping.keys() + def eager_trace_op(self, + op_type, + inputs, + outputs, + attrs, + stop_gradient=False, + inplace_map=None): + assert op_type in name_mapping.keys() - final_state_type = final_state_name_mapping[type]["final_op_name"] - function_ptr = _C_ops.__dict__[final_state_type] + op_type = name_mapping[op_type]["final_op_name"] + function_ptr = _C_ops.__dict__[op_type] - core_ops_args_info = _C_ops.get_final_state_core_ops_args_info() - core_ops_args_type_info = _C_ops.get_final_state_core_ops_args_type_info( - ) - core_ops_returns_info = _C_ops.get_final_state_core_ops_returns_info() + core_ops_args_info = _C_ops.get_core_ops_args_info() + core_ops_args_type_info = _C_ops.get_core_ops_args_type_info() + core_ops_returns_info = _C_ops.get_core_ops_returns_info() - op_args = core_ops_args_info[final_state_type] - op_args_type = core_ops_args_type_info[final_state_type] - op_returns = core_ops_returns_info[final_state_type] + op_args = core_ops_args_info[op_type] + op_args_type = core_ops_args_type_info[op_type] + op_returns = core_ops_returns_info[op_type] arg_list = [] for i in range(len(op_args)): eager_arg_name = op_args[i] arg_type = op_args_type[i] - assert eager_arg_name in final_state_name_mapping[type].keys() - arg_name = final_state_name_mapping[type][eager_arg_name] + assert eager_arg_name in 
name_mapping[op_type].keys() + arg_name = name_mapping[op_type][eager_arg_name] if arg_name in inputs.keys(): arg_to_append = inputs[arg_name] @@ -271,8 +270,8 @@ def eager_final_state_trace_op(self, for i in range(len(op_returns)): eager_retname = op_returns[i] - assert eager_retname in final_state_name_mapping[type].keys() - retname = final_state_name_mapping[type][eager_retname] + assert eager_retname in name_mapping[op_type].keys() + retname = name_mapping[op_type][eager_retname] if retname in outputs.keys(): # Replaced outputs by function returns if isinstance(returns[i], list): @@ -304,16 +303,15 @@ def trace_op(self, if not framework._in_legacy_dygraph(): # inputs : {"sum": [tensor], ...} # outputs : {"sum": [tensor], ...} - if type in final_state_name_mapping.keys(): - final_state_type = final_state_name_mapping[type][ - "final_op_name"] + if type in name_mapping.keys(): + type = name_mapping[type]["final_op_name"] - assert final_state_type in _C_ops.__dict__ - self.eager_final_state_trace_op(type, inputs, outputs, attrs, - stop_gradient, inplace_map) - else: + assert type in _legacy_C_ops.__dict__ self.eager_trace_op(type, inputs, outputs, attrs, stop_gradient, inplace_map) + else: + self.eager_legacy_trace_op(type, inputs, outputs, attrs, + stop_gradient, inplace_map) else: self.trace(type, inputs, outputs, attrs, framework._current_expected_place(), self._has_grad diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 2b7771554b897..8ad8589525895 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -31,7 +31,7 @@ import paddle.utils.deprecated as deprecated import paddle.profiler as profiler from paddle.profiler.utils import in_profiler_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops _grad_scalar = None @@ -818,13 +818,13 @@ def _set_grad_ivar(self, value): @framework.dygraph_only def clone(self): if in_dygraph_mode(): - return _C_ops.final_state_assign(self) + return _C_ops.assign(self) if _in_legacy_dygraph(): output = core.VarBase() else: output = core.eager.Tensor() - return _C_ops.assign(self, output) + return _legacy_C_ops.assign(self, output) @framework.dygraph_only def value(self): @@ -925,7 +925,7 @@ def values(self): """ if self.is_sparse_coo() or self.is_sparse_csr(): - return _C_ops.final_state_sparse_values(self) + return _C_ops.sparse_values(self) else: raise ValueError( "only SparseCooTensor and SparseCsrTensor have method values") @@ -957,9 +957,9 @@ def to_dense(self): """ if self.is_sparse_coo(): - return _C_ops.final_state_sparse_coo_to_dense(self) + return _C_ops.sparse_coo_to_dense(self) elif self.is_sparse_csr(): - return _C_ops.final_state_sparse_to_dense(self) + return _C_ops.sparse_to_dense(self) else: return self @@ -988,7 +988,7 @@ def to_sparse_coo(self, sparse_dim): """ if self.is_sparse_csr(): - return _C_ops.final_state_sparse_to_sparse_coo(self, sparse_dim) + return _C_ops.sparse_to_sparse_coo(self, sparse_dim) elif self.is_sparse_coo(): return self elif self.is_selected_rows(): @@ -996,7 +996,7 @@ def to_sparse_coo(self, sparse_dim): "SelectedRows does not support to_sparse_coo method") else: #is dense tensor - return _C_ops.final_state_sparse_dense_to_coo(self, sparse_dim) + return _C_ops.sparse_dense_to_coo(self, sparse_dim) if framework._in_eager_mode_ and not hasattr(core, "eager"): return diff --git a/python/paddle/fluid/dygraph_utils.py 
b/python/paddle/fluid/dygraph_utils.py index d8c19c94f27e5..849191f546302 100644 --- a/python/paddle/fluid/dygraph_utils.py +++ b/python/paddle/fluid/dygraph_utils.py @@ -14,7 +14,7 @@ from . import core from .framework import dygraph_only, in_dygraph_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops @dygraph_only @@ -41,7 +41,7 @@ def _append_activation_in_dygraph(input, if use_mkldnn: attrs += ('use_mkldnn', use_mkldnn) - act_op = getattr(_C_ops, act) + act_op = getattr(_legacy_C_ops, act) return act_op(input, *attrs) @@ -60,5 +60,5 @@ def _append_bias_in_dygraph(input, bias=None, axis=1, use_mkldnn=False): if bias is None: return input - return _C_ops.elementwise_add(input, bias, 'axis', axis, 'use_mkldnn', - use_mkldnn) + return _legacy_C_ops.elementwise_add(input, bias, 'axis', axis, + 'use_mkldnn', use_mkldnn) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1261eb898a336..fad402cc980e5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -115,7 +115,7 @@ def _update_monkey_methods(is_eager): Update monkey methods of VarBase or eager.Tensor while switching eager mode and legacy mode. """ - from paddle import _C_ops + from paddle import _C_ops, _legacy_C_ops from .dygraph.varbase_patch_methods import monkey_patch_varbase from .dygraph import monkey_patch_math_varbase @@ -125,7 +125,7 @@ def _update_monkey_methods(is_eager): assert isinstance(is_eager, bool) # switch into eager mode if is_eager: - _C_ops.switch_to_eager_ops() + _legacy_C_ops.switch_to_eager_ops() if not _already_patch_eager_tensor: monkey_patch_varbase() monkey_patch_math_varbase() @@ -133,7 +133,7 @@ def _update_monkey_methods(is_eager): _already_patch_eager_tensor = True # switch back into legacy mode else: - _C_ops.switch_to_core_ops() + _legacy_C_ops.switch_to_core_ops() if not _already_patch_varbase: monkey_patch_varbase() monkey_patch_math_varbase() diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 3ecb5dc5602be..f2b4af165d61e 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -25,7 +25,7 @@ from .core import VarDesc from . 
import unique_name from .data_feeder import check_variable_and_dtype, check_type, check_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle __all__ = [ @@ -167,14 +167,15 @@ def forward(self, var, block=None): place = _current_expected_place() if self._force_cpu: place = core.CPUPlace() - _C_ops.final_state_full_(var, var.shape, str(float(self._value)), - var.dtype, place) + _C_ops.full_(var, var.shape, str(float(self._value)), var.dtype, + place) return None elif _in_legacy_dygraph(): - _C_ops.fill_constant(var, 'value', float(self._value), - 'force_cpu', self._force_cpu, 'dtype', - int(var.dtype), 'str_value', - str(float(self._value)), 'shape', var.shape) + _legacy_C_ops.fill_constant(var, 'value', float(self._value), + 'force_cpu', self._force_cpu, 'dtype', + int(var.dtype), 'str_value', + str(float(self._value)), 'shape', + var.shape) return None else: op = block.append_op(type="fill_constant", @@ -274,13 +275,13 @@ def forward(self, var, block=None): out_var = var if framework._non_static_mode(): - out_var = _C_ops.uniform_random( + out_var = _legacy_C_ops.uniform_random( 'shape', var.shape, 'min', self._low, 'max', self._high, 'seed', self._seed, 'dtype', out_dtype, 'diag_num', self._diag_num, 'diag_step', self._diag_step, 'diag_val', self._diag_val) if var.dtype == VarDesc.VarType.FP16: - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) + var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) @@ -378,26 +379,25 @@ def forward(self, var, block=None): if in_dygraph_mode(): place = _current_expected_place() - out_var = _C_ops.final_state_gaussian_random( - var.shape, self._mean, self._std_dev, self._seed, out_dtype, - place) + out_var = _C_ops.gaussian_random(var.shape, self._mean, + self._std_dev, self._seed, + out_dtype, place) if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.final_state_cast(out_var, var.dtype) + var_tmp = _C_ops.cast(out_var, var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) return None if _in_legacy_dygraph(): - out_var = _C_ops.gaussian_random('shape', var.shape, 'dtype', - out_dtype, 'mean', self._mean, - 'std', self._std_dev, 'seed', - self._seed, 'use_mkldnn', False) + out_var = _legacy_C_ops.gaussian_random( + 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, + 'std', self._std_dev, 'seed', self._seed, 'use_mkldnn', False) if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) + var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) @@ -486,25 +486,23 @@ def forward(self, var, block=None): out_var = var if in_dygraph_mode(): - out_var = _C_ops.final_state_truncated_gaussian_random( + out_var = _C_ops.truncated_gaussian_random( var.shape, self._mean, self._std_dev, self._seed, out_dtype, _current_expected_place()) if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.final_state_cast(out_var, var.dtype) + var_tmp = _C_ops.cast(out_var, var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) return None if _in_legacy_dygraph(): - out_var = _C_ops.truncated_gaussian_random('shape', var.shape, - 'dtype', 
out_dtype, - 'mean', self._mean, - 'std', self._std_dev, - 'seed', self._seed) + out_var = _legacy_C_ops.truncated_gaussian_random( + 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, + 'std', self._std_dev, 'seed', self._seed) if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) + var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) @@ -632,34 +630,34 @@ def forward(self, var, block=None): if self._uniform: limit = math.sqrt(6.0 / float(fan_in + fan_out)) if in_dygraph_mode(): - out_var = _C_ops.final_state_uniform_random( - out_var.shape, out_dtype, -limit, limit, self._seed, - _current_expected_place()) + out_var = _C_ops.uniform_random(out_var.shape, out_dtype, + -limit, limit, self._seed, + _current_expected_place()) elif _in_legacy_dygraph(): - out_var = _C_ops.uniform_random('shape', out_var.shape, - 'min', -limit, 'max', limit, - 'seed', self._seed, 'dtype', - out_dtype) + out_var = _legacy_C_ops.uniform_random( + 'shape', out_var.shape, 'min', -limit, 'max', limit, + 'seed', self._seed, 'dtype', out_dtype) else: std = math.sqrt(2.0 / float(fan_in + fan_out)) if in_dygraph_mode(): place = _current_expected_place() - out_var = _C_ops.final_state_gaussian_random( - out_var.shape, 0.0, std, self._seed, out_dtype, place) + out_var = _C_ops.gaussian_random(out_var.shape, 0.0, std, + self._seed, out_dtype, + place) else: - out_var = _C_ops.gaussian_random('shape', out_var.shape, - 'dtype', out_dtype, 'mean', - 0.0, 'std', std, 'seed', - self._seed) + out_var = _legacy_C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, + 'std', std, 'seed', self._seed) if var.dtype == VarDesc.VarType.FP16 or ( var.dtype == VarDesc.VarType.BF16 and not self._uniform): if in_dygraph_mode(): - var_tmp = _C_ops.final_state_cast(out_var, var.dtype) + var_tmp = _C_ops.cast(out_var, var.dtype) elif _in_legacy_dygraph(): - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) + var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', + out_var.dtype, 'out_dtype', + var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) @@ -807,28 +805,28 @@ def forward(self, var, block=None): gain = calculate_gain(self._nonlinearity, self._negative_slope) limit = gain * math.sqrt(3.0 / float(fan_in)) - out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', - -limit, 'max', limit, 'seed', - self._seed, 'dtype', - int(out_dtype)) + out_var = _legacy_C_ops.uniform_random('shape', out_var.shape, + 'min', -limit, 'max', + limit, 'seed', + self._seed, 'dtype', + int(out_dtype)) else: gain = calculate_gain(self._nonlinearity, self._negative_slope) std = gain / math.sqrt(float(fan_in)) if in_dygraph_mode(): place = _current_expected_place() - out_var = _C_ops.final_state_gaussian_random( - out_var.shape, 0.0, std, self._seed, out_dtype, place) + out_var = _C_ops.gaussian_random(out_var.shape, 0.0, std, + self._seed, out_dtype, + place) else: - out_var = _C_ops.gaussian_random('shape', - out_var.shape, 'dtype', - int(out_dtype), 'mean', - 0.0, 'std', std, 'seed', - self._seed) + out_var = _legacy_C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', int(out_dtype), 'mean', + 0.0, 'std', std, 'seed', self._seed) if var.dtype == VarDesc.VarType.FP16 or ( var.dtype == VarDesc.VarType.BF16 and not 
self._uniform): - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) + var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) @@ -988,14 +986,14 @@ def forward(self, var, block=None): raise ValueError("The size of input is too big. ") if framework._non_static_mode(): - _C_ops.assign_value(out_var, 'shape', list(shape), 'dtype', - out_dtype, value_name, values) + _legacy_C_ops.assign_value(out_var, 'shape', list(shape), 'dtype', + out_dtype, value_name, values) if var.dtype in [ VarDesc.VarType.FP16, VarDesc.VarType.BF16, VarDesc.VarType.FP64 ]: - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) + var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) @@ -1095,11 +1093,12 @@ def forward(self, var, block=None): "saving it to file and 'load_op' to load it") if framework._non_static_mode(): - _C_ops.assign_value(out_var, 'shape', list(self._value.shape), - 'dtype', out_dtype, value_name, values) + _legacy_C_ops.assign_value(out_var, 'shape', + list(self._value.shape), 'dtype', + out_dtype, value_name, values) if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) + var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) var_tmp._share_underline_tensor_to(var) else: out_var._share_underline_tensor_to(var) diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py index b0e285e036ebc..0d62bb17ea4d6 100644 --- a/python/paddle/fluid/layers/collective.py +++ b/python/paddle/fluid/layers/collective.py @@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper, unique_name from ..framework import Variable, in_dygraph_mode, _in_legacy_dygraph import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def _allreduce(x, out=None, reduce_type="sum", sync_mode=False): @@ -124,7 +124,7 @@ def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False): if _in_legacy_dygraph(): attrs = ('nranks', nranks, 'ring_id', ring_id, 'use_calc_stream', use_calc_stream) - return _C_ops.c_allgather(x, *attrs) + return _legacy_C_ops.c_allgather(x, *attrs) helper = LayerHelper(op_type, **locals()) out_shape = list(x.shape[:]) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 43781665dc3ca..a5a04e6582dd9 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -29,7 +29,7 @@ from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ... 
import compat as cpt from ..backward import _infer_var_data_type_shape_ -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than', @@ -89,9 +89,9 @@ def select_input(inputs, mask): check_type(inputs, 'inputs', (list, tuple), 'select_input') check_variable_and_dtype(mask, 'mask', ['int32'], 'select_input') - input_dtype = inputs[0].dtype - input_shape = inputs[0].shape - input_type = inputs[0].type + input_dtype = inputs[1].dtype + input_shape = inputs[1].shape + input_type = inputs[1].type out = helper.create_variable(dtype=input_dtype, shape=input_shape, @@ -1190,6 +1190,13 @@ def assign_skip_lod_tensor_array(input, output): """ Assign input to output, but skip the process of copying LoDTensorArray unless it's created in while_block. """ + + def has_shape_diff(x_var, y_var): + if len(x_var.shape) != len(y_var.shape): return True + for x_dim, y_dim in zip(x_var.shape, y_var.shape): + if x_dim != y_dim and -1 not in [x_dim, y_dim]: return True + return False + if not isinstance(input, (Variable, core.VarBase)): if isinstance(output, Variable) and isinstance( input, support_ret_buildin_type): @@ -1205,6 +1212,11 @@ def assign_skip_lod_tensor_array(input, output): if parent_block and not parent_block._find_var_recursive(input.name): assign(input, output) else: + if isinstance(output, Variable) and isinstance( + input, Variable) and has_shape_diff(input, output): + warnings.warn( + "In dy2static mode, we attempt to assign a variable with shape {} into a variable with shape {}, which is not always right." + .format(input.shape, output.shape)) assign(input, output) @@ -1555,7 +1567,7 @@ def increment(x, value=1.0, in_place=True): fluid.layers.increment(counter) # [1.]
""" if in_dygraph_mode(): - return _C_ops.final_state_increment_(x, value) + return _C_ops.increment_(x, value) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], 'increment') @@ -1879,7 +1891,7 @@ def greater_than(x, y, cond=None, name=None): attrs = dict() if in_dygraph_mode(): - return _C_ops.final_state_greater_than(x, y, -1) + return _C_ops.greater_than(x, y, -1) else: helper.append_op(type='greater_than', inputs={ @@ -1978,7 +1990,7 @@ def equal(x, y, cond=None, name=None): """ if in_dygraph_mode(): default_axis = -1 - return _C_ops.final_state_equal(x, y, default_axis) + return _C_ops.equal(x, y, default_axis) check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], "equal") @@ -4038,9 +4050,9 @@ def is_empty(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_is_empty(x) - if _in_legacy_dygraph(): return _C_ops.is_empty(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.is_empty(x) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], 'is_empty') diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4c8d219b2760b..3300a9fc4920b 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -36,7 +36,7 @@ from functools import reduce from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from paddle.utils import deprecated -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..framework import in_dygraph_mode __all__ = [ @@ -951,16 +951,13 @@ def box_coder(prior_box, 'box_coder') if in_dygraph_mode(): if isinstance(prior_box_var, Variable): - box_coder_op = _C_ops.final_state_box_coder(prior_box, - prior_box_var, - target_box, code_type, - box_normalized, axis, - []) + box_coder_op = _C_ops.box_coder(prior_box, prior_box_var, + target_box, code_type, + box_normalized, axis, []) elif isinstance(prior_box_var, list): - box_coder_op = _C_ops.final_state_box_coder(prior_box, None, - target_box, code_type, - box_normalized, axis, - prior_box_var) + box_coder_op = _C_ops.box_coder(prior_box, None, target_box, + code_type, box_normalized, axis, + prior_box_var) else: raise TypeError( "Input variance of box_coder must be Variable or lisz") @@ -1121,7 +1118,8 @@ def yolov3_loss(x, class_num, "ignore_thresh", ignore_thresh, "downsample_ratio", downsample_ratio, "use_label_smooth", use_label_smooth, "scale_x_y", scale_x_y) - loss, _, _ = _C_ops.yolov3_loss(x, gt_box, gt_label, gt_score, *attrs) + loss, _, _ = _legacy_C_ops.yolov3_loss(x, gt_box, gt_label, gt_score, + *attrs) return loss helper = LayerHelper('yolov3_loss', **locals()) @@ -1912,10 +1910,9 @@ def prior_box( step_w, step_h = steps if max_sizes == None: max_sizes = [] - return _C_ops.final_state_prior_box(input, image, min_sizes, - aspect_ratios, variance, max_sizes, - flip, clip, step_w, step_h, offset, - min_max_aspect_ratios_order) + return _C_ops.prior_box(input, image, min_sizes, aspect_ratios, + variance, max_sizes, flip, clip, step_w, step_h, + offset, min_max_aspect_ratios_order) helper = LayerHelper("prior_box", **locals()) dtype = helper.input_dtype() check_variable_and_dtype(input, 'input', @@ -3646,7 +3643,7 @@ def matrix_nms(bboxes, attrs = (score_threshold, nms_top_k, keep_top_k, post_threshold, use_gaussian, gaussian_sigma, background_label, normalized) - out, index = _C_ops.final_state_matrix_nms(bboxes, scores, *attrs) + out, index = _C_ops.matrix_nms(bboxes, scores, *attrs) if return_index: 
return out, index else: @@ -3930,7 +3927,7 @@ def collect_fpn_proposals(multi_rois, if _non_static_mode(): assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode." attrs = ('post_nms_topN', post_nms_top_n) - output_rois, rois_num = _C_ops.collect_fpn_proposals( + output_rois, rois_num = _legacy_C_ops.collect_fpn_proposals( input_rois, input_scores, rois_num_per_level, *attrs) check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals') diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 4fe9cbb087412..65ce37157c2bc 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -24,7 +24,7 @@ from ..layer_helper import LayerHelper from ..data_feeder import check_variable_and_dtype from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'generate_layer_fn', 'generate_activation_fn', 'generate_inplace_fn', @@ -260,14 +260,13 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): - final_state_op_type = "final_state_%s" % op_type - if in_dygraph_mode() and hasattr(_C_ops, final_state_op_type): - op = getattr(_C_ops, final_state_op_type) + if in_dygraph_mode() and hasattr(_C_ops, op_type): + op = getattr(_C_ops, op_type) return op(x) # TODO(dev): Because some ops' yaml has not been migrated. # Replace it with _in_legacy_dygraph once all yaml work is done. if _non_static_mode(): - op = getattr(_C_ops, op_type) + op = getattr(_legacy_C_ops, op_type) return op(x) if op_type not in ["abs", "exp", "square"]: @@ -308,7 +307,7 @@ def generate_inplace_fn(inplace_op_type): def func(x, name=None): if _non_static_mode(): - op = getattr(_C_ops, inplace_op_type) + op = getattr(_legacy_C_ops, inplace_op_type) return op(x) warnings.warn( "In static mode, {}() is the same as {}() and does not perform an inplace operation." diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 61ab466c24362..20c198388a9b4 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -28,7 +28,7 @@ from ..initializer import NumpyArrayInitializer, Constant from .. import core import warnings -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'center_loss', @@ -266,8 +266,9 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): return cross_entropy2(input, label, ignore_index) if _non_static_mode(): - return _C_ops.cross_entropy(input, label, "soft_label", soft_label, - "ignore_index", ignore_index) + return _legacy_C_ops.cross_entropy(input, label, "soft_label", + soft_label, "ignore_index", + ignore_index) inputs = {'X': [input], 'Label': [label]} attrs = {"soft_label": soft_label, "ignore_index": ignore_index} @@ -285,8 +286,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): def cross_entropy2(input, label, ignore_index=kIgnoreIndex): if _non_static_mode(): - loss, _, _ = _C_ops.cross_entropy2(input, label, 'ignore_index', - ignore_index) + loss, _, _ = _legacy_C_ops.cross_entropy2(input, label, 'ignore_index', + ignore_index) return loss inputs = {'X': [input], 'Label': [label]} @@ -549,16 +550,15 @@ def warpctc(input, raise ValueError( "input_length and label_length must not be None in dygraph mode!"
) - loss_out = _C_ops.final_state_warpctc(input, label, input_length, - label_length, blank, - norm_by_times) + loss_out = _C_ops.warpctc(input, label, input_length, label_length, + blank, norm_by_times) return loss_out if _non_static_mode(): if input_length is None or label_length is None: raise ValueError( "input_length and label_length must not be None in dygraph mode!" ) - grad, loss_out = _C_ops.warpctc( + grad, loss_out = _legacy_C_ops.warpctc( input, label, input_length, @@ -1058,16 +1058,16 @@ def sampled_softmax_with_cross_entropy(logits, 'uniq', True, 'remove_accidental_hits', remove_accidental_hits, 'num_samples', num_samples, 'seed', seed) - _, _, _, _, sampled_logits_out, sampled_label_out = _C_ops.sample_logits( + _, _, _, _, sampled_logits_out, sampled_label_out = _legacy_C_ops.sample_logits( logits, label, *sample_logits_attrs) depth = num_samples + 1 - sampled_softlabel_out = _C_ops.one_hot(sampled_label_out, 'depth', - depth) + sampled_softlabel_out = _legacy_C_ops.one_hot(sampled_label_out, + 'depth', depth) softmax_with_cross_entropy_attrs = ('soft_label', True, 'numeric_stable_mode', False) - _, loss = _C_ops.softmax_with_cross_entropy( + _, loss = _legacy_C_ops.softmax_with_cross_entropy( sampled_logits_out, sampled_softlabel_out, *softmax_with_cross_entropy_attrs) return loss / num_true @@ -1280,7 +1280,7 @@ def identity_loss(x, reduction="none"): raise Exception("Unsupported reduction type.") if _non_static_mode(): - return _C_ops.identity_loss(x, "reduction", reduction) + return _legacy_C_ops.identity_loss(x, "reduction", reduction) check_variable_and_dtype(x, 'x', ['float32', 'float64'], "identity_loss") attrs = {'reduction': reduction} @@ -1455,8 +1455,8 @@ def sigmoid_cross_entropy_with_logits(x, """ if in_dygraph_mode(): - return _C_ops.final_state_sigmoid_cross_entropy_with_logits( - x, label, normalize, int(ignore_index)) + return _C_ops.sigmoid_cross_entropy_with_logits(x, label, normalize, + int(ignore_index)) check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], 'sigmoid_cross_entropy_with_logits') @@ -1585,7 +1585,7 @@ def huber_loss(input, label, delta): print(HuberLoss) #[[1.5], [0.5], [0.5], [0. ]], dtype=float32 """ if in_dygraph_mode(): - out, residual = _C_ops.final_state_huber_loss(input, label, delta) + out, residual = _C_ops.huber_loss(input, label, delta) return out helper = LayerHelper('huber_loss', **locals()) diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index f4bf01ce9e373..736213340e902 100755 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -26,7 +26,7 @@ from . import nn from . 
import tensor from ..data_feeder import check_variable_and_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = ['accuracy', 'auc'] @@ -76,10 +76,10 @@ def accuracy(input, label, k=1, correct=None, total=None): total = _varbase_creator(dtype="int32") _k = k.numpy().item(0) if isinstance(k, Variable) else k - topk_out, topk_indices = _C_ops.top_k_v2(input, 'k', _k, 'sorted', - False) - _acc, _, _ = _C_ops.accuracy(topk_out, topk_indices, label, correct, - total) + topk_out, topk_indices = _legacy_C_ops.top_k_v2(input, 'k', _k, + 'sorted', False) + _acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label, + correct, total) return _acc helper = LayerHelper("accuracy", **locals()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 60f065530b528..48e6924660a46 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -41,7 +41,7 @@ from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype import paddle from paddle.utils import deprecated -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'fc', @@ -197,15 +197,15 @@ ] OP_NAMEMAPPING = { - 'elementwise_max': 'final_state_maximum', - 'elementwise_min': 'final_state_minimum', - 'elementwise_pow': 'final_state_elementwise_pow', - 'elementwise_floordiv': 'final_state_floor_divide', - 'elementwise_add': 'final_state_add', - 'elementwise_sub': 'final_state_subtract', - 'elementwise_mul': 'final_state_multiply', - 'elementwise_div': 'final_state_divide', - 'elementwise_mod': 'final_state_modulo', + 'elementwise_max': 'maximum', + 'elementwise_min': 'minimum', + 'elementwise_pow': 'elementwise_pow', + 'elementwise_floordiv': 'floor_divide', + 'elementwise_add': 'add', + 'elementwise_sub': 'subtract', + 'elementwise_mul': 'multiply', + 'elementwise_div': 'divide', + 'elementwise_mod': 'modulo', } @@ -221,7 +221,7 @@ def is_inplace(op_name): return op_name[-1] == "_" if op_name not in OP_NAMEMAPPING.keys() or axis != -1: - op = getattr(_C_ops, op_name) + op = getattr(_legacy_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) else: if in_dygraph_mode(): @@ -231,7 +231,7 @@ def is_inplace(op_name): out = op(x, y) if _in_legacy_dygraph(): - op = getattr(_C_ops, op_name) + op = getattr(_legacy_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) return dygraph_utils._append_activation_in_dygraph(out, act, @@ -1146,11 +1146,12 @@ def dropout(x, seed = default_main_program().random_seed if is_test is None: is_test = not _dygraph_tracer()._train_mode - out, mask = _C_ops.dropout(x, 'dropout_prob', dropout_prob, 'is_test', - is_test, 'fix_seed', seed is not None, - 'seed', seed if seed is not None else 0, - 'dropout_implementation', - dropout_implementation) + out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', dropout_prob, + 'is_test', is_test, 'fix_seed', seed + is not None, 'seed', + seed if seed is not None else 0, + 'dropout_implementation', + dropout_implementation) return out def get_attrs(prog, dropout_prob, is_test, seed): @@ -1455,10 +1456,11 @@ def softmax(input, use_cudnn=True, name=None, axis=-1): """ if in_dygraph_mode(): - return _C_ops.final_state_softmax(input, axis) + return _C_ops.softmax(input, axis) if _non_static_mode(): - return _C_ops.softmax(input, 'axis', axis, 'use_cudnn', use_cudnn) + return _legacy_C_ops.softmax(input, 'axis', axis, 'use_cudnn', + use_cudnn) inputs = {"X": [input]} attrs = {"axis": axis, "use_cudnn": use_cudnn} 
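Every call-site change in this diff follows the same three-way dispatch convention: new-IR dygraph kernels keep their plain names on _C_ops (the final_state_ prefix is dropped), legacy kernels move to _legacy_C_ops with the old name/value attribute convention intact, and the static-graph path is untouched. A minimal sketch of the resulting pattern (my_op and its axis attribute are hypothetical; the mode checks, LayerHelper, and the two ops modules are the real internals used throughout this diff):

from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
from paddle.fluid.layer_helper import LayerHelper

def my_op(x, axis=-1):
    # Hypothetical operator, used only to illustrate the dispatch convention.
    if in_dygraph_mode():
        # New dygraph: final-state kernel, typed positional attributes.
        return _C_ops.my_op(x, axis)
    if _in_legacy_dygraph():
        # Old dygraph: legacy kernel, alternating name/value attribute pairs.
        return _legacy_C_ops.my_op(x, 'axis', axis)
    # Static graph: unchanged; append the op to the current program.
    helper = LayerHelper('my_op', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(type='my_op',
                     inputs={'X': [x]},
                     outputs={'Out': [out]},
                     attrs={'axis': axis})
    return out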
@@ -2264,10 +2266,9 @@ def is_list_or_tuple(ele): pool_padding = update_padding(pool_padding, data_format) if in_dygraph_mode(): - return _C_ops.final_state_pool2d(input, pool_size, pool_stride, - pool_padding, ceil_mode, exclusive, - data_format, pool_type, global_pooling, - False, padding_algorithm) + return _C_ops.pool2d(input, pool_size, pool_stride, pool_padding, + ceil_mode, exclusive, data_format, pool_type, + global_pooling, False, padding_algorithm) op_type = 'pool2d' helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() @@ -3029,11 +3030,11 @@ def batch_norm(input, data_layout, 'use_mkldnn', False, 'fuse_with_relu', False, 'use_global_stats', use_global_stats) if inputs_has_MomemtumTensor: - batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( + batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( input, scale, bias, mean, variance, momentum, mean_out, variance_out, *attrs_) else: - batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( + batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( input, scale, bias, mean, variance, None, mean_out, variance_out, *attrs_) @@ -3263,12 +3264,12 @@ def inplace_abn(input, False, 'use_global_stats', use_global_stats, 'activation', act, 'alpha', act_alpha) if inputs_has_MomemtumTensor: - batch_norm_out, _, _, _, _, _ = _C_ops.inplace_abn_( + batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( input, scale, bias, mean, variance, momentum, mean_out, variance_out, *attrs__) return batch_norm_out else: - batch_norm_out, _, _, _, _, _ = _C_ops.inplace_abn_( + batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( input, scale, bias, mean, variance, None, mean_out, variance_out, *attrs__) return batch_norm_out @@ -4665,15 +4666,15 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): input.shape) else False dim = dim if dim != None and dim != [] else [0] if reduce_all: - return _C_ops.final_state_sum(input, [], None, keep_dim) + return _C_ops.sum(input, [], None, keep_dim) else: - return _C_ops.final_state_sum(input, dim, None, keep_dim) + return _C_ops.sum(input, dim, None, keep_dim) elif _in_legacy_dygraph(): reduce_all = True if dim == None or dim == [] or len(dim) == len( input.shape) else False dim = dim if dim != None and dim != [] else [0] - return _C_ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim, - 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim, + 'reduce_all', reduce_all) attrs = { 'dim': dim if dim != None and dim != [] else [0], @@ -4947,7 +4948,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): "The type of axis must be int, list or tuple, but received {}". format(type(dim))) if in_dygraph_mode(): - return _C_ops.final_state_reduce_prod( + return _C_ops.reduce_prod( input, dim if dim != None and dim != [] else [0], keep_dim, True if dim == None or dim == [] or len(dim) == len(input.shape) else False) @@ -5179,10 +5180,10 @@ def split(input, num_or_sections, dim=-1, name=None): "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " "received %s." 
% (type(num_or_sections))) if in_dygraph_mode(): - return _C_ops.final_state_split(input, [num], dim) + return _C_ops.split(input, [num], dim) elif _in_legacy_dygraph(): out = [_varbase_creator() for n in range(num)] - _C_ops.split(input, out, *attrs) + _legacy_C_ops.split(input, out, *attrs) return out check_variable_and_dtype( @@ -5306,8 +5307,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): if len(x.shape) == 1: axis = 0 if _non_static_mode(): - _, out = _C_ops.norm(x, 'axis', 1 if axis is None else axis, 'epsilon', - epsilon) + _, out = _legacy_C_ops.norm(x, 'axis', 1 if axis is None else axis, + 'epsilon', epsilon) return out check_variable_and_dtype(x, "X", ("float16", "float32", "float64"), "norm") @@ -5405,8 +5406,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): """ if _non_static_mode(): out = _varbase_creator(dtype=x.dtype) - _C_ops.matmul(x, y, out, 'transpose_X', transpose_x, 'transpose_Y', - transpose_y, 'alpha', float(alpha)) + _legacy_C_ops.matmul(x, y, out, 'transpose_X', transpose_x, + 'transpose_Y', transpose_y, 'alpha', float(alpha)) return out def __check_input(x, y): @@ -5539,7 +5540,7 @@ def topk(input, k, name=None): """ if _non_static_mode(): _k = k.numpy().item(0) if isinstance(k, Variable) else k - out, indices = _C_ops.top_k(input, 'k', _k) + out, indices = _legacy_C_ops.top_k(input, 'k', _k) out.stop_gradient = True indices.stop_gradient = True return out, indices @@ -5787,10 +5788,10 @@ def transpose(x, perm, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_transpose(x, perm) + return _C_ops.transpose(x, perm) else: if _in_legacy_dygraph(): - out, _ = _C_ops.transpose2(x, 'axis', perm) + out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out check_variable_and_dtype(x, 'x', [ @@ -6079,9 +6080,9 @@ def multiplex(inputs, index, name=None): """ if _in_legacy_dygraph(): - return _C_ops.multiplex(index, inputs) + return _legacy_C_ops.multiplex(index, inputs) if in_dygraph_mode(): - return _C_ops.final_state_multiplex(inputs, index) + return _C_ops.multiplex(inputs, index) helper = LayerHelper('multiplex', **locals()) check_type(inputs, 'inputs', (list), 'multiplex') @@ -6272,8 +6273,8 @@ def one_hot(input, depth, allow_out_of_range=False): assert depth.shape == ( 1, ), "depth of type Variable should have shape [1]" depth = depth.item(0) - out = _C_ops.one_hot(input, 'depth', depth, 'allow_out_of_range', - allow_out_of_range) + out = _legacy_C_ops.one_hot(input, 'depth', depth, 'allow_out_of_range', + allow_out_of_range) out.stop_gradient = True return out @@ -6454,11 +6455,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): item.numpy().item(0) if isinstance(item, Variable) else item for item in shape ] - out = _C_ops.final_state_reshape(x, shape) + out = _C_ops.reshape(x, shape) elif isinstance(shape, tmp_tensor_type): - # TODO: Tensor shape in final_state_reshape has not been tested + # TODO: Tensor shape in reshape has not been tested shape.stop_gradient = True - out = _C_ops.final_state_reshape(x, shape) + out = _C_ops.reshape(x, shape) else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," @@ -6477,10 +6478,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): item.numpy().item(0) if isinstance(item, Variable) else item for item in shape ] - out, _ = _C_ops.reshape2(x, None, 'shape', shape) + out, _ = _legacy_C_ops.reshape2(x, None, 'shape', shape) elif isinstance(shape, tmp_tensor_type): 
shape.stop_gradient = True - out, _ = _C_ops.reshape2(x, shape) + out, _ = _legacy_C_ops.reshape2(x, shape) else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," @@ -6614,9 +6615,9 @@ def squeeze(input, axes, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_squeeze(input, axes) + return _C_ops.squeeze(input, axes) if _in_legacy_dygraph(): - out, _ = _C_ops.squeeze2(input, 'axes', axes) + out, _ = _legacy_C_ops.squeeze2(input, 'axes', axes) return out helper = LayerHelper("squeeze", **locals()) @@ -6688,9 +6689,9 @@ def unsqueeze(input, axes, name=None): for item in axes ] if _in_legacy_dygraph(): - out, _ = _C_ops.unsqueeze2(input, 'axes', axes) + out, _ = _legacy_C_ops.unsqueeze2(input, 'axes', axes) return out - return _C_ops.final_state_unsqueeze(input, axes) + return _C_ops.unsqueeze(input, axes) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_variable_and_dtype(input, 'input', [ @@ -7233,14 +7234,14 @@ def label_smooth(label, label=one_hot_label, epsilon=0.1, dtype="float32") """ if in_dygraph_mode(): - return _C_ops.final_state_label_smooth(label, prior_dist, - float(epsilon)) + return _C_ops.label_smooth(label, prior_dist, float(epsilon)) if epsilon > 1. or epsilon < 0.: raise ValueError("The value of epsilon must be between 0 and 1.") if _non_static_mode(): - return _C_ops.label_smooth(label, prior_dist, 'epsilon', float(epsilon)) + return _legacy_C_ops.label_smooth(label, prior_dist, 'epsilon', + float(epsilon)) check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'label_smooth') @@ -7332,10 +7333,9 @@ def roi_pool(input, """ if _non_static_mode(): assert rois_num is not None, "rois_num should not be None in dygraph mode." - pool_out, argmaxes = _C_ops.roi_pool(input, rois, rois_num, - "pooled_height", pooled_height, - "pooled_width", pooled_width, - "spatial_scale", spatial_scale) + pool_out, argmaxes = _legacy_C_ops.roi_pool( + input, rois, rois_num, "pooled_height", pooled_height, + "pooled_width", pooled_width, "spatial_scale", spatial_scale) return pool_out, argmaxes check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool') @@ -7422,17 +7422,16 @@ def roi_align(input, """ if in_dygraph_mode(): assert rois_num is not None, "rois_num should not be None in dygraph mode." - return _C_ops.final_state_roi_align(input, rois, rois_num, - pooled_height, pooled_width, - spatial_scale, sampling_ratio, - False) + return _C_ops.roi_align(input, rois, rois_num, pooled_height, + pooled_width, spatial_scale, sampling_ratio, + False) if _in_legacy_dygraph(): assert rois_num is not None, "rois_num should not be None in dygraph mode." 
- align_out = _C_ops.roi_align(input, rois, rois_num, "pooled_height", - pooled_height, "pooled_width", - pooled_width, "spatial_scale", - spatial_scale, "sampling_ratio", - sampling_ratio) + align_out = _legacy_C_ops.roi_align(input, rois, rois_num, + "pooled_height", pooled_height, + "pooled_width", pooled_width, + "spatial_scale", spatial_scale, + "sampling_ratio", sampling_ratio) return align_out check_variable_and_dtype(input, 'input', ['float32', 'float64'], @@ -7992,15 +7991,15 @@ def _is_list_or_turple_(data): dy_attr = tuple(attr_list) if resample_type == "linear": - out = _C_ops.linear_interp(input, actual_shape, *dy_attr) + out = _legacy_C_ops.linear_interp(input, actual_shape, *dy_attr) elif resample_type == "bilinear": - out = _C_ops.bilinear_interp(input, actual_shape, *dy_attr) + out = _legacy_C_ops.bilinear_interp(input, actual_shape, *dy_attr) elif resample_type == "trilinear": - out = _C_ops.trilinear_interp(input, actual_shape, *dy_attr) + out = _legacy_C_ops.trilinear_interp(input, actual_shape, *dy_attr) elif resample_type == "nearest": - out = _C_ops.nearest_interp(input, actual_shape, *dy_attr) + out = _legacy_C_ops.nearest_interp(input, actual_shape, *dy_attr) elif resample_type == "bicubic": - out = _C_ops.bicubic_interp(input, actual_shape, *dy_attr) + out = _legacy_C_ops.bicubic_interp(input, actual_shape, *dy_attr) return out out = helper.create_variable_for_type_inference(dtype) @@ -8727,7 +8726,7 @@ def gather(input, index, overwrite=True): output = fluid.layers.gather(x, index) """ if _non_static_mode(): - return _C_ops.gather(input, index, None, 'overwrite', overwrite) + return _legacy_C_ops.gather(input, index, None, 'overwrite', overwrite) check_variable_and_dtype( input, 'x', @@ -8822,10 +8821,10 @@ def gather_nd(input, index, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_gather_nd(input, index) + return _C_ops.gather_nd(input, index) else: if _in_legacy_dygraph(): - return _C_ops.gather_nd(input, index) + return _legacy_C_ops.gather_nd(input, index) check_variable_and_dtype( input, 'input', ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], 'gather_nd') @@ -9003,10 +9002,10 @@ def scatter_nd_add(ref, index, updates, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_scatter_nd_add(ref, index, updates) + return _C_ops.scatter_nd_add(ref, index, updates) else: if _in_legacy_dygraph(): - op = getattr(_C_ops, 'scatter_nd_add') + op = getattr(_legacy_C_ops, 'scatter_nd_add') return op(ref, index, updates) else: if ref.dtype != updates.dtype: @@ -9156,9 +9155,9 @@ def log(x, name=None): # [[0.693147, 1.09861, 1.38629], [1.94591, 2.07944, 2.19722]] """ if in_dygraph_mode(): - return _C_ops.final_state_log(x) - if _in_legacy_dygraph(): return _C_ops.log(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.log(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], "log") inputs = {'X': [x]} @@ -9199,9 +9198,9 @@ def relu(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_relu(x) - if _in_legacy_dygraph(): return _C_ops.relu(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.relu(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu') @@ -9332,7 +9331,7 @@ def mean_iou(input, label, num_classes): mean_iou, out_wrong, out_correct = paddle.metric.mean_iou(predict, label, num_classes) """ if _non_static_mode(): - return _C_ops.mean_iou(input, label, 'num_classes', num_classes) + return _legacy_C_ops.mean_iou(input, label, 'num_classes', num_classes) helper =
LayerHelper('mean_iou', **locals()) check_variable_and_dtype(input, 'Predictions', ['int32', 'int64'], @@ -9813,8 +9812,9 @@ def pad2d(input, if _non_static_mode(): _paddings = paddings.numpy().tolist() if isinstance( paddings, Variable) else paddings - return _C_ops.pad2d(input, 'mode', mode, 'pad_value', pad_value, - 'data_format', data_format, 'paddings', _paddings) + return _legacy_C_ops.pad2d(input, 'mode', mode, 'pad_value', pad_value, + 'data_format', data_format, 'paddings', + _paddings) check_variable_and_dtype( input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], @@ -10012,7 +10012,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): """ if _non_static_mode(): - return _C_ops.stanh(x, 'scale_a', scale_a, 'scale_b', scale_b) + return _legacy_C_ops.stanh(x, 'scale_a', scale_a, 'scale_b', scale_b) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'stanh') @@ -10055,7 +10055,7 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): result = fluid.layers.hard_sigmoid(data) # [[0.6, 0.6], [0.6, 0.6], [0.6, 0.6]] """ if _non_static_mode(): - return _C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) + return _legacy_C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hard_sigmoid') @@ -10290,7 +10290,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): #[ 1. 10.]] """ if _non_static_mode(): - return _C_ops.brelu(x, 't_min', t_min, 't_max', t_max) + return _legacy_C_ops.brelu(x, 't_min', t_min, 't_max', t_max) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'brelu') @@ -10449,7 +10449,7 @@ def flatten(x, axis=1, name=None): x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], 'flatten') if _non_static_mode(): - return _C_ops.flatten2(x, 'axis', axis)[0] + return _legacy_C_ops.flatten2(x, 'axis', axis)[0] helper = LayerHelper('flatten', **locals()) @@ -10553,10 +10553,10 @@ def stack(x, axis=0, name=None): axis = 0 if axis is None else axis if in_dygraph_mode(): - return _C_ops.final_state_stack(x, axis) + return _C_ops.stack(x, axis) if _in_legacy_dygraph(): - return _C_ops.stack(x, 'axis', axis) + return _legacy_C_ops.stack(x, 'axis', axis) if not isinstance(x, list) and not isinstance(x, tuple): # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY created by create_array, array_write, array_read, etc.
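The module split also changes the attribute calling convention at each call site, which is why the rewritten calls change shape as well as module. Using stack from the hunk above, a hedged sketch of the difference (which branch runs depends on the active dygraph mode):

import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph

x = [paddle.to_tensor([1.0, 2.0]), paddle.to_tensor([3.0, 4.0])]
if in_dygraph_mode():
    out = _C_ops.stack(x, 0)                 # axis passed positionally
elif _in_legacy_dygraph():
    out = _legacy_C_ops.stack(x, 'axis', 0)  # axis passed as a name/value pair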
@@ -10719,7 +10719,7 @@ def unstack(x, axis=0, num=None): num = x.shape[axis] if num == 0: return [] - return _C_ops.unstack(x, num, 'axis', int(axis), 'num', num) + return _legacy_C_ops.unstack(x, num, 'axis', int(axis), 'num', num) helper = LayerHelper('unstack', **locals()) if num is None: @@ -10816,7 +10816,7 @@ def expand(x, expand_times, name=None): expand_times_tensor = expand_times expand_times_tensor.stop_gradient = True - return _C_ops.expand(x, expand_times_tensor, *attrs) + return _legacy_C_ops.expand(x, expand_times_tensor, *attrs) inputs = {"X": [x]} attrs = {} @@ -10928,7 +10928,7 @@ def expand_as(x, target_tensor, name=None): """ if _non_static_mode(): - return _C_ops.expand_as(x, target_tensor) + return _legacy_C_ops.expand_as(x, target_tensor) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], @@ -11150,15 +11150,15 @@ def gaussian_random(shape, if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) place = _current_expected_place() - return _C_ops.final_state_gaussian_random(shape, float(mean), - float(std), seed, dtype, - place) + return _C_ops.gaussian_random(shape, float(mean), float(std), seed, + dtype, place) if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _C_ops.gaussian_random('shape', - shape, 'mean', float(mean), 'std', - float(std), 'seed', seed, 'dtype', dtype) + return _legacy_C_ops.gaussian_random('shape', shape, + 'mean', float(mean), 'std', + float(std), 'seed', seed, 'dtype', + dtype) check_type(shape, 'shape', (list, tuple, Variable), 'gaussian_random/randn') check_dtype(dtype, 'dtype', ['float32', 'float64'], 'gaussian_random/randn') @@ -11480,8 +11480,7 @@ def slice(input, axes, starts, ends): tensor_t = ends.numpy() ends = [ele for ele in tensor_t] - return _C_ops.final_state_slice(input, axes, starts, ends, infer_flags, - []) + return _C_ops.slice(input, axes, starts, ends, infer_flags, []) else: if _in_legacy_dygraph(): attrs = () @@ -11532,9 +11531,9 @@ def slice(input, axes, starts, ends): ends_tensor.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) - return _C_ops.slice(input, starts_tensor, ends_tensor, None, None, - 'axes', axes, 'infer_flags', infer_flags, - *attrs) + return _legacy_C_ops.slice(input, starts_tensor, ends_tensor, None, + None, 'axes', axes, 'infer_flags', + infer_flags, *attrs) if not isinstance(starts, (list, tuple, Variable)): raise ValueError( @@ -11699,8 +11698,7 @@ def strided_slice(input, axes, starts, ends, strides): # sliced_2 is input[:, 0:3:1, 0:2:1, 2:4:2]. 
""" if in_dygraph_mode(): - return _C_ops.final_state_strided_slice(input, axes, starts, ends, - strides) + return _C_ops.strided_slice(input, axes, starts, ends, strides) helper = LayerHelper('strided_slice', **locals()) @@ -11869,11 +11867,11 @@ def shape(input): print(res) # [array([ 3, 100, 100], dtype=int32)] """ if in_dygraph_mode(): - out = _C_ops.final_state_shape(input) + out = _C_ops.shape(input) out.stop_gradient = True return out if _in_legacy_dygraph(): - out = _C_ops.shape(input) + out = _legacy_C_ops.shape(input) out.stop_gradient = True return out @@ -11948,10 +11946,10 @@ def size(input): """ if in_dygraph_mode(): - return _C_ops.final_state_size(input) + return _C_ops.size(input) if _in_legacy_dygraph(): - return _C_ops.size(input) + return _legacy_C_ops.size(input) check_variable_and_dtype( input, 'input', @@ -12043,12 +12041,13 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ if in_dygraph_mode(): - out = _C_ops.final_state_scale(x, scale, float(bias), bias_after_scale) + out = _C_ops.scale(x, scale, float(bias), bias_after_scale) return dygraph_utils._append_activation_in_dygraph(out) if _non_static_mode(): _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale - out = _C_ops.scale(x, 'scale', float(_scale), 'bias', float(bias), - 'bias_after_scale', bias_after_scale) + out = _legacy_C_ops.scale(x, 'scale', float(_scale), 'bias', + float(bias), 'bias_after_scale', + bias_after_scale) return dygraph_utils._append_activation_in_dygraph(out) check_variable_and_dtype(x, "x", [ @@ -12778,7 +12777,7 @@ def gen_data(): def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): if _non_static_mode(): - op = getattr(_C_ops, op_name) + op = getattr(_legacy_C_ops, op_name) if binary_op: return op(x, y) else: @@ -12851,7 +12850,7 @@ def logical_and(x, y, out=None, name=None): print(res) # [True False True False] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_and(x, y) + return _C_ops.logical_and(x, y) return _logical_op(op_name="logical_and", x=x, @@ -12897,7 +12896,7 @@ def logical_or(x, y, out=None, name=None): print(res) # [[ True True] [ True False]] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_or(x, y) + return _C_ops.logical_or(x, y) return _logical_op(op_name="logical_or", x=x, y=y, @@ -12942,7 +12941,7 @@ def logical_xor(x, y, out=None, name=None): print(res) # [[False, True], [ True, False]] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_xor(x, y) + return _C_ops.logical_xor(x, y) return _logical_op(op_name="logical_xor", x=x, @@ -12981,7 +12980,7 @@ def logical_not(x, out=None, name=None): print(res) # [False True False True] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_not(x) + return _C_ops.logical_not(x) return _logical_op(op_name="logical_not", x=x, y=None, @@ -13073,9 +13072,9 @@ def clip_by_norm(x, max_norm, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_clip_by_norm(x, max_norm) + return _C_ops.clip_by_norm(x, max_norm) if _non_static_mode(): - return _C_ops.clip_by_norm(x, 'max_norm', max_norm) + return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm) helper = LayerHelper("clip_by_norm", **locals()) check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm') @@ -13124,9 +13123,9 @@ def mean(x, name=None): """ if _in_legacy_dygraph(): - return _C_ops.mean(x) + return _legacy_C_ops.mean(x) if in_dygraph_mode(): - return _C_ops.final_state_mean_all(x) + return _C_ops.mean_all(x) helper = 
LayerHelper("mean", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mean') @@ -13163,7 +13162,7 @@ def merge_selected_rows(x, name=None): y = fluid.layers.merge_selected_rows(var) """ if _non_static_mode(): - return _C_ops.merge_selected_rows(x) + return _legacy_C_ops.merge_selected_rows(x) helper = LayerHelper("merge_selected_rows", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -13210,8 +13209,8 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): """ if _non_static_mode(): - return _C_ops.mul(x, y, 'x_num_col_dims', x_num_col_dims, - 'y_num_col_dims', y_num_col_dims) + return _legacy_C_ops.mul(x, y, 'x_num_col_dims', x_num_col_dims, + 'y_num_col_dims', y_num_col_dims) inputs = {"X": [x], "Y": [y]} attrs = {"x_num_col_dims": x_num_col_dims, "y_num_col_dims": y_num_col_dims} @@ -13828,7 +13827,8 @@ def add_position_encoding(input, alpha, beta, name=None): """ if _non_static_mode(): - return _C_ops.add_position_encoding(input, "alpha", alpha, "beta", beta) + return _legacy_C_ops.add_position_encoding(input, "alpha", alpha, + "beta", beta) helper = LayerHelper('add_position_encoding', **locals()) check_variable_and_dtype(input, 'input', ['float32', 'float64'], @@ -14759,9 +14759,9 @@ def where(condition): """ if in_dygraph_mode(): - return _C_ops.final_state_where_index(condition) - if _in_legacy_dygraph(): return _C_ops.where_index(condition) + if _in_legacy_dygraph(): + return _legacy_C_ops.where_index(condition) helper = LayerHelper("where_index", **locals()) @@ -15432,8 +15432,8 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): # [[-1], [1]] """ if in_dygraph_mode(): - return _C_ops.final_state_shard_index(input, index_num, nshards, - shard_id, ignore_value) + return _C_ops.shard_index(input, index_num, nshards, shard_id, + ignore_value) check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') op_type = 'shard_index' @@ -15507,8 +15507,8 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): print(out) # [[0.66666667, 1.66666667,3., 4.]] """ if _non_static_mode(): - return _C_ops.hard_swish(x, 'threshold', threshold, 'scale', scale, - 'offset', offset) + return _legacy_C_ops.hard_swish(x, 'threshold', threshold, 'scale', + scale, 'offset', offset) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hard_swish') @@ -15587,9 +15587,9 @@ def mish(x, threshold=20, name=None): print(out) # [[0.66666667, 1.66666667, 3., 4.]] """ if in_dygraph_mode(): - return _C_ops.final_state_mish(x, threshold) + return _C_ops.mish(x, threshold) if _in_legacy_dygraph(): - return _C_ops.mish(x, 'threshold', threshold) + return _legacy_C_ops.mish(x, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'mish') check_type(threshold, 'threshold', (float, int), 'mish') @@ -15756,13 +15756,14 @@ def uniform_random(shape, if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - return _C_ops.final_state_uniform_random(shape, dtype, float(min), - float(max), seed, - _current_expected_place()) + return _C_ops.uniform_random(shape, dtype, float(min), float(max), seed, + _current_expected_place()) elif _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _C_ops.uniform_random('shape', shape, 'min', float(min), 'max', - float(max), 'seed', seed, 'dtype', dtype) + return _legacy_C_ops.uniform_random('shape', + shape, 'min', float(min), 'max', + float(max), 'seed', seed, 'dtype', + dtype) check_type(shape, 'shape', 
(list, tuple, Variable), 'uniform_random/rand') check_dtype(dtype, 'dtype', ('float32', 'float64', 'uint16'), diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 01da331e57b2e..51b72267329cf 100755 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -19,7 +19,7 @@ from ..framework import convert_np_dtype_to_dtype_, Variable, in_dygraph_mode from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from paddle.utils import deprecated -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle __deprecated_func_name__ = { @@ -818,7 +818,7 @@ def gelu(x, approximate=False): def erf(x, name=None): if in_dygraph_mode(): - return _C_ops.final_state_erf(x) + return _C_ops.erf(x) locals_var = locals().copy() kwargs = dict() diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 6d8b0992ff014..c0ebe1adb61b7 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -20,7 +20,7 @@ from ..layer_helper import LayerHelper from ..data_feeder import check_variable_and_dtype, check_type, check_dtype from ..core import VarDesc -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'sequence_conv', diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c073b003345c1..1eefe759c708d 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -32,7 +32,7 @@ from paddle.utils import deprecated from .utils import check_shape -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'create_tensor', @@ -245,12 +245,12 @@ def cast(x, dtype): if in_dygraph_mode(): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - return _C_ops.final_state_cast(x, dtype) + return _C_ops.cast(x, dtype) if _non_static_mode(): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - out = _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + out = _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) return out check_variable_and_dtype(x, 'x', [ @@ -329,7 +329,7 @@ def concat(input, axis=0, name=None): axis = axis.item(0) if not isinstance(input, Variable): input = [t for t in input if t.shape.count(0) == 0] - out = _C_ops.final_state_concat(input, axis) + out = _C_ops.concat(input, axis) return out if _in_legacy_dygraph(): @@ -339,7 +339,7 @@ def concat(input, axis=0, name=None): if not isinstance(input, Variable): input = [t for t in input if t.shape.count(0) == 0] out = _varbase_creator() - _C_ops.concat(input, out, 'axis', axis) + _legacy_C_ops.concat(input, out, 'axis', axis) return out check_type(input, 'input', (list, tuple, Variable), 'concat') @@ -633,14 +633,14 @@ def assign(input, output=None): if isinstance(input, (Variable, core.VarBase)): if _non_static_mode(): if in_dygraph_mode() and output is None: - output = _C_ops.final_state_assign(input) + output = _C_ops.assign(input) else: if output is None: if _in_legacy_dygraph(): output = core.VarBase() else: output = core.eager.Tensor() - _C_ops.assign(input, output) + _legacy_C_ops.assign(input, output) else: check_dtype(input.dtype, 'input', [ 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', @@ -690,13 +690,13 @@ def assign(input, output=None): if in_dygraph_mode(): if output is None: output = zeros(list(input.shape), dtype) - 
_C_ops.final_state_assign_value_(output, list(input.shape), dtype, - values, _current_expected_place()) + _C_ops.assign_value_(output, list(input.shape), dtype, values, + _current_expected_place()) elif _in_legacy_dygraph(): if output is None: output = core.VarBase() - _C_ops.assign_value(output, 'shape', list(input.shape), 'dtype', - dtype, value_name, values) + _legacy_C_ops.assign_value(output, 'shape', list(input.shape), + 'dtype', dtype, value_name, values) else: if output is None: output = helper.create_variable_for_type_inference( @@ -790,13 +790,13 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if out is None: - out = _C_ops.final_state_full(shape, float(value), dtype, place) + out = _C_ops.full(shape, float(value), dtype, place) out.stop_gradient = True return out if out is not None: # final state mode supports the case where out is not None. - _C_ops.final_state_full_(out, shape, float(value), dtype, place) + _C_ops.full_(out, shape, float(value), dtype, place) out.stop_gradient = True return out @@ -811,9 +811,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): else: attrs['str_value'] = str(float(value.numpy().item(0))) - _C_ops.fill_constant(out, 'value', float(value), 'force_cpu', force_cpu, - 'dtype', out.dtype, 'str_value', - attrs['str_value'], 'shape', shape) + _legacy_C_ops.fill_constant(out, 'value', float(value), 'force_cpu', + force_cpu, 'dtype', out.dtype, 'str_value', + attrs['str_value'], 'shape', shape) out.stop_gradient = True return out @@ -903,9 +903,8 @@ def fill_constant_batch_size_like(input, place = _current_expected_place() if force_cpu: place = core.CPUPlace() - out = _C_ops.final_state_full_batch_size_like(input, shape, dtype, - value, input_dim_idx, - output_dim_idx, place) + out = _C_ops.full_batch_size_like(input, shape, dtype, value, + input_dim_idx, output_dim_idx, place) out.stop_gradient = True return out @@ -1284,9 +1283,9 @@ def reverse(x, axis): axis = [axis] if in_dygraph_mode(): if x.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: - return _C_ops.final_state_reverse_array(x, axis) + return _C_ops.reverse_array(x, axis) else: - return _C_ops.final_state_reverse(x, axis) + return _C_ops.reverse(x, axis) helper = LayerHelper("reverse", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type='reverse', @@ -1390,7 +1389,7 @@ def has_inf(x): """ if _non_static_mode(): - return _C_ops.isinf(x) + return _legacy_C_ops.isinf(x) check_type(x, 'x', (Variable), 'has_inf') helper = LayerHelper("isinf", **locals()) @@ -1419,7 +1418,7 @@ def has_nan(x): """ if _non_static_mode(): - return _C_ops.isnan(x) + return _legacy_C_ops.isnan(x) check_type(x, 'x', (Variable), 'has_nan') helper = LayerHelper("isnan", **locals()) @@ -1536,11 +1535,10 @@ def range(start, end, step, dtype, name=None): step = cast(step, dtype) if in_dygraph_mode(): - return _C_ops.final_state_arange(start, end, step, dtype, - _current_expected_place()) + return _C_ops.arange(start, end, step, dtype, _current_expected_place()) if _in_legacy_dygraph(): - out = _C_ops.range(start, end, step) + out = _legacy_C_ops.range(start, end, step) out.stop_gradient = True return out @@ -1609,11 +1607,10 @@ def linspace(start, stop, num, dtype=None, name=None): with device_guard("cpu"): tensor_num = fill_constant([1], 'int32', num) if in_dygraph_mode(): - return _C_ops.final_state_linspace(tensor_start, tensor_stop, - tensor_num, dtype) + return
_C_ops.linspace(tensor_start, tensor_stop, tensor_num, dtype) if _in_legacy_dygraph(): - return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', - dtype) + return _legacy_C_ops.linspace(tensor_start, tensor_stop, tensor_num, + 'dtype', dtype) helper = LayerHelper("linspace", **locals()) start_dtype = convert_dtype(tensor_start.dtype) @@ -1803,11 +1800,11 @@ def eye(num_rows, num_columns = num_rows if in_dygraph_mode(): - out = _C_ops.final_state_eye(num_rows, num_columns, dtype, - _current_expected_place()) + out = _C_ops.eye(num_rows, num_columns, dtype, + _current_expected_place()) elif _in_legacy_dygraph(): - out = _C_ops.eye('dtype', dtype, 'num_rows', num_rows, 'num_columns', - num_columns) + out = _legacy_C_ops.eye('dtype', dtype, 'num_rows', num_rows, + 'num_columns', num_columns) else: helper = LayerHelper("eye", **locals()) check_dtype(dtype, 'dtype', @@ -1830,8 +1827,8 @@ def eye(num_rows, re_shape = re_shape + [num_rows, num_columns] expand_times = batch_shape + [1, 1] if _non_static_mode(): - out = _C_ops.reshape(out, 'shape', re_shape) - return _C_ops.expand(out, None, 'expand_times', expand_times) + out = _legacy_C_ops.reshape(out, 'shape', re_shape) + return _legacy_C_ops.expand(out, None, 'expand_times', expand_times) if not isinstance(batch_shape, list): raise TypeError("batch_shape should be a list") diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py index d5be4423775b4..4581248d06ac2 100644 --- a/python/paddle/fluid/op.py +++ b/python/paddle/fluid/op.py @@ -124,6 +124,8 @@ def __call__(self, *args, **kwargs): new_attr.bools.extend(user_defined_attr) elif attr.type == framework_pb2.LONGS: new_attr.longs.extend(user_defined_attr) + elif attr.type == framework_pb2.FLOAT64: + new_attr.float64 = user_defined_attr else: raise NotImplementedError( "A not supported attribute type: %s." % diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d3ab3e72d0314..c53872c0e54da 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -43,7 +43,7 @@ from .wrapped_decorator import signature_safe_contextmanager from .. 
import compat as cpt import warnings -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _current_expected_place __all__ = [ @@ -445,14 +445,14 @@ def set_lr(self, value): if current_lr is not None: if in_dygraph_mode(): place = _current_expected_place() - _C_ops.final_state_full_(current_lr, list(current_lr.shape), - float(value), current_lr.dtype, - place) + _C_ops.full_(current_lr, list(current_lr.shape), + float(value), current_lr.dtype, place) elif _in_legacy_dygraph(): - _C_ops.fill_constant(current_lr, 'value', float(value), - 'dtype', current_lr.dtype, 'shape', - list(current_lr.shape)) + _legacy_C_ops.fill_constant(current_lr, 'value', + float(value), 'dtype', + current_lr.dtype, 'shape', + list(current_lr.shape)) else: global_block = framework.default_main_program( ).global_block() @@ -949,7 +949,7 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): assert regularization_term is not None if framework._non_static_mode(): - return _C_ops.sum([grad, regularization_term]) + return _legacy_C_ops.sum([grad, regularization_term]) new_grad = grad if grad.type == core.VarDesc.VarType.SELECTED_ROWS: @@ -1376,12 +1376,12 @@ def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) if in_dygraph_mode(): - _C_ops.final_state_sgd_(param_and_grad[0], lr, param_and_grad[1], - master_weight, find_master) + _C_ops.sgd_(param_and_grad[0], lr, param_and_grad[1], master_weight, + find_master) return None if _in_legacy_dygraph(): - _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, - param_and_grad[0], master_weight) + _legacy_C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], + master_weight, param_and_grad[0], master_weight) return None assert isinstance(block, framework.Block) @@ -1514,11 +1514,10 @@ def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) master_weight = None if framework._non_static_mode(): - _, _, _ = _C_ops.momentum(param_and_grad[0], param_and_grad[1], - velocity_acc, lr, master_weight, - param_and_grad[0], velocity_acc, - master_weight, 'mu', self._momentum, - 'use_nesterov', self._use_nesterov) + _, _, _ = _legacy_C_ops.momentum( + param_and_grad[0], param_and_grad[1], velocity_acc, lr, + master_weight, param_and_grad[0], velocity_acc, master_weight, + 'mu', self._momentum, 'use_nesterov', self._use_nesterov) return None attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} @@ -2171,7 +2170,7 @@ def _append_optimize_op(self, block, param_and_grad): outputs["MasterParamOut"] = master_weight if framework._non_static_mode(): - tmp, tmp2 = _C_ops.lars_momentum( + tmp, tmp2 = _legacy_C_ops.lars_momentum( [param_and_grad[0]], [param_and_grad[1]], [velocity_acc], [lr], [param_and_grad[0]], [velocity_acc], "mu", self._momentum, "lars_coeff", self._lars_coeff, "lars_weight_decay", @@ -2286,16 +2285,16 @@ def _append_optimize_op(self, block, param_and_grad): moment_acc = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) if in_dygraph_mode(): - _C_ops.final_state_adagrad_(param_and_grad[0], param_and_grad[1], - moment_acc, - self._create_param_lr(param_and_grad), - self._epsilon) + _C_ops.adagrad_(param_and_grad[0], param_and_grad[1], moment_acc, + self._create_param_lr(param_and_grad), + self._epsilon) return None elif _in_legacy_dygraph(): - _C_ops.adagrad(param_and_grad[0], param_and_grad[1], moment_acc, - self._create_param_lr(param_and_grad), - 
param_and_grad[0], moment_acc, "epsilon", - self._epsilon) + _legacy_C_ops.adagrad(param_and_grad[0], param_and_grad[1], + moment_acc, + self._create_param_lr(param_and_grad), + param_and_grad[0], moment_acc, "epsilon", + self._epsilon) return None else: # Create the adagrad optimizer op @@ -2574,7 +2573,7 @@ def _append_optimize_op(self, block, param_and_grad): _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) master_weight = None - _, _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _, _ = _legacy_C_ops.adam( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, @@ -2813,16 +2812,16 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[0]) if framework.in_dygraph_mode(): - _C_ops.final_state_adamax_(param_and_grad[0], param_and_grad[1], - self._create_param_lr(param_and_grad), - moment, inf_norm, beta1_pow_acc, - self._beta1, self._beta2, self._epsilon) + _C_ops.adamax_(param_and_grad[0], param_and_grad[1], + self._create_param_lr(param_and_grad), moment, + inf_norm, beta1_pow_acc, self._beta1, self._beta2, + self._epsilon) elif framework._in_legacy_dygraph(): - _C_ops.adamax(param_and_grad[0], param_and_grad[1], - self._create_param_lr(param_and_grad), moment, - inf_norm, beta1_pow_acc, param_and_grad[0], moment, - inf_norm, "beta1", self._beta1, "beta2", self._beta2, - "epsilon", self._epsilon) + _legacy_C_ops.adamax(param_and_grad[0], param_and_grad[1], + self._create_param_lr(param_and_grad), moment, + inf_norm, beta1_pow_acc, param_and_grad[0], + moment, inf_norm, "beta1", self._beta1, + "beta2", self._beta2, "epsilon", self._epsilon) else: # create the adamax optimize op adamax_op = block.append_op( @@ -2862,10 +2861,11 @@ def _finish_update(self, block, parameters_and_grads): param) if framework._non_static_mode(): if framework.in_dygraph_mode(): - tmp = _C_ops.final_state_scale(beta1_pow_acc, - self._beta1, 0.0, True) + tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0, + True) else: - tmp = _C_ops.scale(beta1_pow_acc, "scale", self._beta1) + tmp = _legacy_C_ops.scale(beta1_pow_acc, "scale", + self._beta1) beta1_pow_acc.copy_(tmp, False) else: block.append_op(type="scale", @@ -2952,11 +2952,11 @@ def _append_optimize_op(self, block, param_and_grad): self._seed = 0 if framework._non_static_mode(): - _C_ops.dpsgd(param_and_grad[0], param_and_grad[1], - self._create_param_lr(param_and_grad), - param_and_grad[0], "clip", self._clip, "batch_size", - self._batch_size, "sigma", self._sigma, "seed", - self._seed) + _legacy_C_ops.dpsgd(param_and_grad[0], param_and_grad[1], + self._create_param_lr(param_and_grad), + param_and_grad[0], "clip", self._clip, + "batch_size", self._batch_size, "sigma", + self._sigma, "seed", self._seed) else: dpsgd_op = block.append_op(type=self.type, inputs={ @@ -3072,11 +3072,12 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[0]) if framework._non_static_mode(): - _C_ops.decayed_adagrad(param_and_grad[0], param_and_grad[1], - moment_acc, - self._create_param_lr(param_and_grad), - param_and_grad[0], moment_acc, "epsilon", - self._epsilon, "decay", self._decay) + _legacy_C_ops.decayed_adagrad(param_and_grad[0], param_and_grad[1], + moment_acc, + self._create_param_lr(param_and_grad), + param_and_grad[0], moment_acc, + "epsilon", self._epsilon, "decay", + self._decay) else: # Create the decayed adagrad optimizer op decayed_adagrad_op = 
block.append_op( @@ -3198,16 +3199,15 @@ def _append_optimize_op(self, block, param_and_grad): self._avg_squared_update_acc_str, param_and_grad[0]) if framework.in_dygraph_mode(): - _C_ops.final_state_adadelta_(param_and_grad[0], param_and_grad[1], - avg_squared_grad_acc, - avg_squared_update_acc, self._rho, - self._epsilon) + _C_ops.adadelta_(param_and_grad[0], param_and_grad[1], + avg_squared_grad_acc, avg_squared_update_acc, + self._rho, self._epsilon) elif framework._in_legacy_dygraph(): - _C_ops.adadelta(param_and_grad[0], param_and_grad[1], - avg_squared_grad_acc, avg_squared_update_acc, - param_and_grad[0], avg_squared_grad_acc, - avg_squared_update_acc, "epsilon", self._epsilon, - "rho", self._rho) + _legacy_C_ops.adadelta(param_and_grad[0], param_and_grad[1], + avg_squared_grad_acc, avg_squared_update_acc, + param_and_grad[0], avg_squared_grad_acc, + avg_squared_update_acc, "epsilon", + self._epsilon, "rho", self._rho) else: # Create the adadelta optimizer op adadelta_op = block.append_op(type=self.type, @@ -3399,20 +3399,20 @@ def _append_optimize_op(self, block, param_and_grad): mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, param_and_grad[0]) if in_dygraph_mode(): - _C_ops.final_state_rmsprop_(param_and_grad[0], mean_square_acc, - param_and_grad[1], momentum_acc, - self._create_param_lr(param_and_grad), - mean_grad_acc, self._epsilon, self._rho, - self._momentum, self._centered) + _C_ops.rmsprop_(param_and_grad[0], mean_square_acc, + param_and_grad[1], momentum_acc, + self._create_param_lr(param_and_grad), + mean_grad_acc, self._epsilon, self._rho, + self._momentum, self._centered) return None elif _in_legacy_dygraph(): - _C_ops.rmsprop(param_and_grad[0], mean_square_acc, - self._create_param_lr(param_and_grad), - param_and_grad[1], momentum_acc, param_and_grad[0], - momentum_acc, mean_square_acc, mean_grad_acc, - "epsilon", self._epsilon, "decay", self._rho, - "momentum", self._momentum, "centered", - self._centered) + _legacy_C_ops.rmsprop(param_and_grad[0], mean_square_acc, + self._create_param_lr(param_and_grad), + param_and_grad[1], momentum_acc, + param_and_grad[0], momentum_acc, + mean_square_acc, mean_grad_acc, "epsilon", + self._epsilon, "decay", self._rho, "momentum", + self._momentum, "centered", self._centered) return None else: rmsprop_op = block.append_op( @@ -3579,11 +3579,12 @@ def _append_optimize_op(self, block, param_and_grad): linear_acc = self._get_accumulator(self._linear_acc_str, param_and_grad[0]) if framework._non_static_mode(): - _C_ops.ftrl(param_and_grad[0], squared_acc, - linear_acc, param_and_grad[1], - self._create_param_lr(param_and_grad), - param_and_grad[0], squared_acc, linear_acc, "l1", - self._l1, "l2", self._l2, "lr_power", self._lr_power) + _legacy_C_ops.ftrl(param_and_grad[0], squared_acc, linear_acc, + param_and_grad[1], + self._create_param_lr(param_and_grad), + param_and_grad[0], squared_acc, linear_acc, "l1", + self._l1, "l2", self._l2, "lr_power", + self._lr_power) else: ftrl_op = block.append_op(type=self.type, @@ -3741,12 +3742,13 @@ def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) master_weight = None if framework._non_static_mode(): - _C_ops.lamb(param_and_grad[0], param_and_grad[1], lr, moment1, - moment2, beta1_pow_acc, beta2_pow_acc, master_weight, - param_and_grad[0], moment1, moment2, beta1_pow_acc, - beta2_pow_acc, master_weight, 'beta1', self._beta1, - 'beta2', self._beta2, 'epsilon', self._epsilon, - 'weight_decay', weight_decay) + 
_legacy_C_ops.lamb(param_and_grad[0], param_and_grad[1], lr, + moment1, moment2, beta1_pow_acc, beta2_pow_acc, + master_weight, param_and_grad[0], moment1, + moment2, beta1_pow_acc, beta2_pow_acc, + master_weight, 'beta1', self._beta1, 'beta2', + self._beta2, 'epsilon', self._epsilon, + 'weight_decay', weight_decay) return None # create the lamb optimize op diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 4328d824071f4..1c7b3558753de 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -18,7 +18,7 @@ from . import framework from .framework import _non_static_mode, _varbase_creator, in_dygraph_mode from . import core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer'] @@ -135,11 +135,11 @@ def __call__(self, param, grad, block): if framework._non_static_mode(): if framework.in_dygraph_mode(): - return _C_ops.final_state_scale(param, - self._regularization_coeff, 0.0, - True) + return _C_ops.scale(param, self._regularization_coeff, 0.0, + True) else: - return _C_ops.scale(param, "scale", self._regularization_coeff) + return _legacy_C_ops.scale(param, "scale", + self._regularization_coeff) else: decay = block.create_var(dtype=param.dtype, shape=param.shape, @@ -253,9 +253,8 @@ def __call__(self, param, grad, block): shape=param.shape, lod_level=param.lod_level) if in_dygraph_mode(): - sign = _C_ops.final_state_sign(param) - return _C_ops.final_state_scale(sign, self._regularization_coeff, - 0.0, True) + sign = _C_ops.sign(param) + return _C_ops.scale(sign, self._regularization_coeff, 0.0, True) # Append sign op block.append_op(type='sign', inputs={"X": param}, outputs={"Out": sign}) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0bdd3183c9259..d1b82387f9dfc 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -7,20 +7,11 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0) set(dist_ENVS http_proxy="" https_proxy="") -file( - GLOB MULTINODE_DIST_TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_multinode_*.py") -string(REPLACE ".py" "" MULTINODE_DIST_TEST_OPS "${MULTINODE_DIST_TEST_OPS}") - file( GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op") -if((NOT WITH_NCCL) AND (NOT WITH_RCCL)) - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl") -endif() string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") @@ -32,75 +23,22 @@ if((NOT WITH_GPU) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ps_gpu_ctr") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_batch_merge") endif() - -list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist) -list(APPEND DIST_TEST_OPS test_pipeline) -list(APPEND DIST_TEST_OPS test_ir_pass_pipeline) -list(APPEND DIST_TEST_OPS test_static_model_parallel) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_feedforward) list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_attention) list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_multi_transformer) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) -list(APPEND DIST_TEST_OPS 
test_parallel_dygraph_sparse_embedding_over_height) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) -if(WITH_GPU - OR WITH_XPU - OR WITH_ASCEND - OR WITH_ASCEND_CL) - list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) - list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) - list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) - list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) - list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) - list(APPEND DIST_TEST_OPS test_rnn_dp) -endif() -list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync_gradient_check) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) -list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2) -list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) -list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3) -list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3_for_eager) -list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api) -list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api_for_eager) -list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) -list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) -list(APPEND DIST_TEST_OPS test_parallel_class_center_sample) -list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) +list(APPEND DIST_TEST_OPS + test_parallel_dygraph_pipeline_parallel_with_virtual_stage) list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard) list(APPEND DIST_TEST_OPS test_auto_parallel_save_load) list(APPEND DIST_TEST_OPS test_auto_parallel_autoconvert) -list(APPEND DIST_TEST_OPS test_collective_process_group) -list(APPEND DIST_TEST_OPS test_collective_alltoall_single) -list(APPEND DIST_TEST_OPS test_eager_dist_api) -list(APPEND DIST_TEST_OPS test_collective_batch_isend_irecv) -list(APPEND DIST_TEST_OPS test_collective_reduce_scatter) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_qat) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
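[Editor's note] The optimizer.py and regularizer.py hunks above are a mechanical rename, not a behavior change: eager kernels drop their `final_state_` prefix but stay on `_C_ops`, while the old imperative kernels move to the new `_legacy_C_ops` module. A minimal sketch of the three-way dispatch those hunks implement, using the `scale` calls from regularizer.py (only calls shown in the diff are used; the helper name is invented):

```python
# Sketch of the dispatch pattern in the hunks above; `scale_regularizer` is an
# illustrative name, the op calls and mode checks are taken from the diff.
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid import framework


def scale_regularizer(param, coeff):
    if framework.in_dygraph_mode():
        # New eager op: attributes are plain positional arguments
        # (scale, bias, bias_after_scale).
        return _C_ops.scale(param, coeff, 0.0, True)
    elif framework._in_legacy_dygraph():
        # Legacy op: attributes are passed as ("name", value) pairs.
        return _legacy_C_ops.scale(param, "scale", coeff)
    # Static graph keeps falling through to block.append_op(...).
    return None
```

The same split explains the optimizer hunks: `adadelta_` and `rmsprop_` gained the terse `_C_ops` form on the eager branch, while `ftrl` and `lamb` still take the long legacy argument list through `_legacy_C_ops` under `framework._non_static_mode()`.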
-foreach(TEST_OP ${MULTINODE_DIST_TEST_OPS}) - list(APPEND MIXED_DIST_TEST_OPS ${TEST_OP}) -endforeach() - -list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) -list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op) -list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler) -list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op) -list(APPEND MIXED_DIST_TEST_OPS test_c_comm_init_op) list(APPEND MIXED_DIST_TEST_OPS test_communicator_async) list(APPEND MIXED_DIST_TEST_OPS test_communicator_ps_gpu) list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo) -list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async) -list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ps) list(APPEND MIXED_DIST_TEST_OPS test_launch_coverage) list(APPEND MIXED_DIST_TEST_OPS test_fleetrun) @@ -111,32 +49,11 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend) list(APPEND MIXED_DIST_TEST_OPS test_ascend_group) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) -list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) + list(APPEND MIXED_DIST_TEST_OPS test_fleet_base) list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2) list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS - test_fleet_pipeline_meta_optimizer_with_recompute) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_rnn_dp) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_sharding_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_hybrid_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_fp16_allreduce_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_distributed_strategy) list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_static_mp_layers) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_searcher) @@ -146,7 +63,6 @@ list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_serial) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_cost_model) -list(APPEND MIXED_DIST_TEST_OPS test_tcp_store) list(APPEND MIXED_DIST_TEST_OPS test_dygraph_hybrid_dp) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) @@ -172,30 +88,8 @@ list(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass) if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) list(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op) - list(REMOVE_ITEM 
TEST_OPS test_allgather) list(REMOVE_ITEM TEST_OPS test_c_embedding_op) - list(REMOVE_ITEM TEST_OPS test_collective_reduce) list(REMOVE_ITEM TEST_OPS test_pipeline_parallel) - list(REMOVE_ITEM TEST_OPS test_collective_scatter) - list(REMOVE_ITEM TEST_OPS test_collective_sendrecv) - list(REMOVE_ITEM TEST_OPS test_reducescatter) - list(REMOVE_ITEM TEST_OPS test_reducescatter_api) - list(REMOVE_ITEM TEST_OPS test_collective_split_embedding_none_divisible) - list(REMOVE_ITEM TEST_OPS test_collective_split_row_linear) - list(REMOVE_ITEM TEST_OPS test_collective_split_col_linear) - list(REMOVE_ITEM TEST_OPS test_collective_reduce_api) - list(REMOVE_ITEM TEST_OPS test_collective_scatter_api) - list(REMOVE_ITEM TEST_OPS test_collective_barrier_api) - list(REMOVE_ITEM TEST_OPS test_collective_allreduce_api) - list(REMOVE_ITEM TEST_OPS test_new_group_api) - list(REMOVE_ITEM TEST_OPS test_collective_broadcast_api) - list(REMOVE_ITEM TEST_OPS test_collective_allgather_api) - list(REMOVE_ITEM TEST_OPS test_collective_allgather_object_api) - list(REMOVE_ITEM TEST_OPS test_collective_alltoall_api) - list(REMOVE_ITEM TEST_OPS test_collective_global_gather) - list(REMOVE_ITEM TEST_OPS test_collective_global_scatter) - list(REMOVE_ITEM TEST_OPS test_collective_sendrecv_api) - list(REMOVE_ITEM TEST_OPS test_collective_wait) list(REMOVE_ITEM TEST_OPS test_memcpy_op) list(REMOVE_ITEM TEST_OPS test_raw_program_optimizer) list(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale) @@ -210,7 +104,6 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) list(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_tensor) endif() -# Temporally disable test_deprecated_decorator list(REMOVE_ITEM TEST_OPS test_deprecated_decorator) if(WIN32) @@ -237,19 +130,15 @@ endif() if(NOT WITH_DISTRIBUTE OR WIN32) # DISTRIBUTE related list(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization) - list(REMOVE_ITEM TEST_OPS test_distributed_strategy) list(REMOVE_ITEM TEST_OPS test_fleet_metric) list(REMOVE_ITEM TEST_OPS test_fleet_ps) list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_2) - list(REMOVE_ITEM TEST_OPS test_fleet_utils) - list(REMOVE_ITEM TEST_OPS test_collective_cpu_barrier_with_gloo) list(REMOVE_ITEM TEST_OPS test_delete_c_identity_op_pass) # TODO: Fix these unittests failed on Windows list(REMOVE_ITEM TEST_OPS test_fake_init_op) endif() if(NOT WITH_DISTRIBUTE) - list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new) list(REMOVE_ITEM TEST_OPS test_desc_clone_dist) endif() @@ -278,8 +167,8 @@ endif() list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_hybrid_parallel) -list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer_gloo -)# NOTE: @xiongkun03, cpu is too slow, fix it in next PR +list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer_gloo) +# NOTE: @xiongkun03, cpu is too slow, fix it in next PR if(NOT WITH_GLOO) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel_cpuonly) @@ -294,42 +183,15 @@ endif() if((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) - list(REMOVE_ITEM TEST_OPS test_rank_attention_op - )# TODO(shenliang03): rank_attention_op support CPU device in future - list(REMOVE_ITEM TEST_OPS test_batch_fc_op - )# TODO(shenliang03): batch_fc_op support CPU device in future - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist - )# TODO(Yancey1989): parallel dygraph support CPU device in future - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext) - list(REMOVE_ITEM TEST_OPS 
test_parallel_dygraph_sparse_embedding) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync_gradient_check) + list(REMOVE_ITEM TEST_OPS test_rank_attention_op) + # TODO(shenliang03): rank_attention_op support CPU device in future + list(REMOVE_ITEM TEST_OPS test_batch_fc_op) + # TODO(shenliang03): batch_fc_op support CPU device in future + # TODO(Yancey1989): parallel dygraph support CPU device in future list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) - list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) - list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) - list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3) - list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3_for_eager) - list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api) - list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api_for_eager) - list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) - list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) - list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision_for_eager) - list(REMOVE_ITEM TEST_OPS test_mixed_precision) + list(REMOVE_ITEM TEST_OPS + test_parallel_dygraph_pipeline_parallel_with_virtual_stage) list(REMOVE_ITEM TEST_OPS test_fleet_base_single) - list(REMOVE_ITEM TEST_OPS test_dygraph_recompute) - list(REMOVE_ITEM TEST_OPS test_dygraph_recompute_for_eager) - list(REMOVE_ITEM TEST_OPS test_hybrid_parallel_inference_helper) - list(REMOVE_ITEM TEST_OPS test_parallel_class_center_sample) - list(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner) list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt) list(REMOVE_ITEM TEST_OPS test_auto_parallel_searcher) @@ -342,12 +204,6 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard) list(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load) list(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert) - list(REMOVE_ITEM TEST_OPS test_collective_process_group) - list(REMOVE_ITEM TEST_OPS test_collective_alltoall_single) - list(REMOVE_ITEM TEST_OPS test_eager_dist_api) - list(REMOVE_ITEM TEST_OPS test_collective_batch_isend_irecv) - list(REMOVE_ITEM TEST_OPS test_collective_reduce_scatter) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_qat) elseif(WITH_GPU) if(${CUDNN_VERSION} VERSION_LESS 7100) @@ -355,35 +211,28 @@ elseif(WITH_GPU) endif() endif() -if(WITH_NCCL) - if(${NCCL_VERSION} VERSION_LESS 2212) - list(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) - list(REMOVE_ITEM DIST_TEST_OPS - test_parallel_dygraph_sparse_embedding_over_height) - list(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_transformer) - endif() -endif() - if((NOT WITH_NCCL) AND (NOT WITH_RCCL)) list(REMOVE_ITEM TEST_OPS test_imperative_group) - list(REMOVE_ITEM TEST_OPS test_new_group_api) endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) list(REMOVE_ITEM TEST_OPS 
test_fused_gate_attention_op) list(REMOVE_ITEM TEST_OPS test_boxps) + list(REMOVE_ITEM TEST_OPS test_allgather) + list(REMOVE_ITEM TEST_OPS test_reducescatter) + list(REMOVE_ITEM TEST_OPS test_reducescatter_api) endif() -list(REMOVE_ITEM TEST_OPS test_seq_concat_op -)# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 -list(REMOVE_ITEM TEST_OPS test_lstm_unit_op -)# # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 +list(REMOVE_ITEM TEST_OPS test_seq_concat_op) +# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 +list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) +# # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test -list(REMOVE_ITEM TEST_OPS decorator_helper -)# decorator_helper is a helper python file, not a test +list(REMOVE_ITEM TEST_OPS decorator_helper) +# decorator_helper is a helper python file, not a test if(APPLE) if(NOT WITH_DISTRIBUTE) @@ -619,8 +468,6 @@ list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass) list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass) list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_mnist) list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while) -# disable test_cumsum_op temporaily -# list(REMOVE_ITEM TEST_OPS test_cumsum_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) @@ -637,7 +484,6 @@ if(APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset_dataloader) list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_base) - # list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_process) list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_fds_clear) list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exit_func) @@ -696,7 +542,6 @@ if((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) list(REMOVE_ITEM TEST_OPS "test_fleet_graph_execution_meta_optimizer") - list(REMOVE_ITEM TEST_OPS "test_gen_nccl_id_op") list(REMOVE_ITEM TEST_OPS "test_dist_fleet_grad_clip") list(REMOVE_ITEM TEST_OPS "test_dist_fleet_heter_ctr") list(REMOVE_ITEM TEST_OPS "test_dist_fleet_ps_gpu_ctr") @@ -810,67 +655,16 @@ if(WITH_DISTRIBUTE) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ps12") endif() - py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS}) py_test_modules(test_communicator_async MODULES test_communicator_async ENVS ${dist_ENVS}) - # py_test_modules(test_communicator_ps_gpu MODULES test_communicator_ps_gpu - # ENVS ${dist_ENVS}) py_test_modules(test_communicator_geo MODULES test_communicator_geo ENVS ${dist_ENVS}) - py_test_modules( - test_communicator_half_async - MODULES - test_communicator_half_async - ENVS - ${dist_ENVS} - FLAGS_communicator_send_queue_size=1 - FLAGS_communicator_max_merge_var_num=1) - py_test_modules( - test_communicator_sync - MODULES - test_communicator_sync - ENVS - ${dist_ENVS} - FLAGS_communicator_send_queue_size=1 - FLAGS_communicator_max_merge_var_num=1) - py_test_modules(test_collective_optimizer MODULES test_collective_optimizer) if(NOT APPLE) py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS}) py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS}) py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS 
${dist_ENVS}) - py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS - ${dist_ENVS}) - py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES - test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_private_function MODULES - test_fleet_private_function ENVS ${dist_ENVS}) - py_test_modules(test_fleet_meta_optimizer_base MODULES - test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) - py_test_modules(test_fleet_distributed_strategy MODULES - test_fleet_distributed_strategy) - py_test_modules(test_fleet_static_mp_layers MODULES - test_fleet_static_mp_layers) - #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) - if(WITH_GPU - OR WITH_XPU - OR WITH_ASCEND - OR WITH_ASCEND_CL) - py_test_modules(test_fleet_amp_meta_optimizer MODULES - test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules( - test_fleet_gradient_merge_meta_optimizer MODULES - test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_graph_executor MODULES - test_fleet_graph_executor ENVS ${dist_ENVS}) - py_test_modules(test_fleet_hybrid_meta_optimizer MODULES - test_fleet_hybrid_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_recompute_meta_optimizer MODULES - test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_sharding_meta_optimizer MODULES - test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS}) - endif() if(NOT WIN32) py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS}) @@ -891,52 +685,16 @@ if(WITH_DISTRIBUTE) py_test_modules(test_auto_parallel_cost_model MODULES test_auto_parallel_cost_model ENVS ${dist_ENVS}) - if(WITH_GPU - OR WITH_XPU - OR WITH_ASCEND - OR WITH_ASCEND_CL) - py_test_modules(test_fleet_lamb_meta_optimizer MODULES - test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_lars_meta_optimizer MODULES - test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_localsgd_meta_optimizer MODULES - test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) - - endif() endif() endif() - if(WITH_DGC) - # if with dgc, test all dgc tests. - # NOTE. 
dist dgc tests is already in DIST_TEST_OPS - py_test_modules(test_dgc_op MODULES test_dgc_op) - py_test_modules(test_dgc_momentum_op MODULES test_dgc_momentum_op) - py_test_modules(test_dgc_optimizer MODULES test_dgc_optimizer) - py_test_modules(test_fleet_dgc_meta_optimizer MODULES - test_fleet_dgc_meta_optimizer) - else() + if(NOT WITH_DGC) # if not with dgc, must close all dgc tests list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc") endif() - # port range (20000, 23000) is reserved for dist-ops - set(dist_ut_port 20001) - if(NOT WIN32) - bash_test_modules( - test_tcp_store - START_BASH - dist_test.sh - LABELS - "RUN_TYPE=EXCLUSIVE" - ENVS - "PADDLE_DIST_UT_PORT=${dist_ut_port}") - math(EXPR dist_ut_port "${dist_ut_port}+1") - endif() - if(NOT APPLE) if(WITH_GPU OR WITH_ROCM) - bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh - ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) py_test_modules(test_launch_coverage MODULES test_launch_coverage) endif() @@ -967,22 +725,8 @@ if(WITH_DISTRIBUTE) PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() - # add new dist test - if(WITH_DISTRIBUTE AND WITH_MULTINODE_TESTING) - foreach(TEST_OP ${MULTINODE_DIST_TEST_OPS}) - bash_test_modules( - ${TEST_OP} - START_BASH - multinode_dist_test.sh - LABELS - "RUN_TYPE=EXCLUSIVE" - ENVS - "PADDLE_DIST_UT_PORT=${dist_ut_port}") - endforeach() - - endif() - - # port range (20000, 23000) is reserved for dist-ops + # port range (20000, 21200) is reserved for dist-ops + set(dist_ut_port 20001) foreach(TEST_OP ${DIST_TEST_OPS}) bash_test_modules( ${TEST_OP} @@ -992,8 +736,8 @@ if(WITH_DISTRIBUTE) "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") - math(EXPR dist_ut_port "${dist_ut_port}+20") - if(dist_ut_port GREATER_EQUAL 22998) + math(EXPR dist_ut_port "${dist_ut_port}+10") + if(dist_ut_port GREATER_EQUAL 21198) message( FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() @@ -1019,20 +763,6 @@ if(WITH_DISTRIBUTE) "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() - if(WITH_GPU - OR WITH_XPU - OR WITH_ASCEND - OR WITH_ASCEND_CL) - bash_test_modules( - test_new_group - START_BASH - test_new_group.sh - LABELS - "RUN_TYPE=EXCLUSIVE" - ENVS - "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" - PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - endif() endif() endif() @@ -1226,9 +956,8 @@ endif() # dist xpu tests: if(WITH_XPU_BKCL) - #py_test(test_collective_reduce_api_xpu SRCS "test_collective_reduce_api.py") py_test(test_collective_allreduce_api_xpu - SRCS "test_collective_allreduce_api.py") + SRCS "collective/test_collective_allreduce_api.py") endif() if(WITH_HETERPS) @@ -1285,8 +1014,6 @@ set_tests_properties( test_fetch_unmerged test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST") -# disable test_parallel_executor_fetch_isolated_var -# set_tests_properties(test_parallel_executor_fetch_isolated_var PROPERTIES LABELS "RUN_TYPE=DIST") set_tests_properties( test_parallel_executor_crf test_sync_batch_norm_op @@ -1307,7 +1034,6 @@ if(NOT WIN32 AND NOT APPLE) PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_imperative_data_loader_fds_clear PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") - # set_tests_properties(test_imperative_data_loader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_multiprocess_dataloader_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_multiprocess_dataloader_dynamic @@ -1337,32 +1063,16 
@@ if(NOT WIN32) endif() endif() -if(WITH_DISTRIBUTE AND NOT WIN32) - set_tests_properties(test_fleet_utils PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_cpu_barrier_with_gloo PROPERTIES TIMEOUT - 40) -endif() - if(WITH_DISTRIBUTE) - set_tests_properties(test_communicator_half_async PROPERTIES TIMEOUT 120) set_tests_properties(test_dist_fleet_ctr2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_fleet_infer PROPERTIES TIMEOUT 200) - set_tests_properties(test_dist_fleet_raw_program_optimizer PROPERTIES TIMEOUT - 120) set_tests_properties(test_dist_fleet_raw_program_optimizer_fuse_allreduce PROPERTIES TIMEOUT 60) set_tests_properties(test_dist_dygraph_apis PROPERTIES TIMEOUT 120) endif() -if(WITH_DISTRIBUTE AND NOT APPLE) - if(WITH_GPU OR WITH_ROCM) - set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120) - set_tests_properties(test_dist_mnist_gradient_merge PROPERTIES TIMEOUT 360) - endif() -endif() - # setting timeout value as 15S set_tests_properties(test_run PROPERTIES TIMEOUT 120) set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120) @@ -1564,56 +1274,12 @@ set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) - set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT - 120) - set_tests_properties(test_parallel_dygraph_mnist PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_se_resnext PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT - 350) - set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT - 350) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300) - set_tests_properties(test_parallel_dygraph_no_sync_gradient_check - PROPERTIES TIMEOUT 60) - set_tests_properties(test_parallel_dygraph_pipeline_parallel - PROPERTIES TIMEOUT 500) - set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT - 200) - set_tests_properties(test_parallel_dygraph_sharding_parallel - PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT - 120) - set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) - set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350) - set_tests_properties(test_dygraph_sharding_stage3_for_eager PROPERTIES TIMEOUT - 350) - set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_group_sharded_api_for_eager - PROPERTIES TIMEOUT 120) - set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) - set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT - 120) - set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT - 120) + set_tests_properties( + test_parallel_dygraph_pipeline_parallel_with_virtual_stage + PROPERTIES TIMEOUT 500) set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_alltoall_single PROPERTIES 
TIMEOUT 60) - set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100) - set_tests_properties(test_collective_batch_isend_irecv PROPERTIES TIMEOUT 100) - set_tests_properties(test_collective_reduce_scatter PROPERTIES TIMEOUT 100) - set_tests_properties(test_parallel_dygraph_qat PROPERTIES TIMEOUT 120) - if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) - set_tests_properties(test_parallel_dygraph_sparse_embedding - PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT - 200) - set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height - PROPERTIES TIMEOUT 150) - endif() endif() if(APPLE) @@ -1626,63 +1292,23 @@ if(APPLE) endif() if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) - set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 300) - set_tests_properties(test_collective_allgather_object_api PROPERTIES TIMEOUT - 120) - set_tests_properties(test_collective_alltoall_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT 200) - set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT 200) - set_tests_properties(test_collective_sendrecv_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE) - set_tests_properties(test_new_group_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) - set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT 120) - set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240) set_tests_properties(test_static_model_parallel_fused_feedforward PROPERTIES TIMEOUT 120) set_tests_properties(test_static_model_parallel_fused_attention PROPERTIES TIMEOUT 120) set_tests_properties(test_static_model_parallel_fused_multi_transformer PROPERTIES TIMEOUT 120) - set_tests_properties( - test_collective_split_embedding_none_divisible - test_collective_split_row_linear - test_collective_split_col_linear - test_collective_scatter_api - test_collective_barrier_api - test_collective_reduce_api - test_pipeline_parallel - test_collective_allreduce_api - test_new_group_api - test_collective_broadcast_api - test_collective_allgather_api - test_collective_allgather_object_api - test_collective_alltoall_api - test_collective_global_gather - test_collective_global_scatter - PROPERTIES LABELS "RUN_TYPE=DIST") + set_tests_properties(test_pipeline_parallel PROPERTIES LABELS + "RUN_TYPE=DIST") + set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) + set_tests_properties(test_allgather PROPERTIES TIMEOUT 120) endif() set_tests_properties(test_paddle_multiprocessing PROPERTIES TIMEOUT 120) - set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT 120) set_tests_properties(test_pipeline_parallel PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT 120) - set_tests_properties(test_allgather PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_scatter PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_sendrecv PROPERTIES TIMEOUT 120) endif() 
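[Editor's note] A quick sanity check of the port window tightened earlier in this CMakeLists (start 20001, stride 10, hard failure at 21198, comment now reading "port range (20000, 21200) is reserved for dist-ops"). The loop below merely replays the CMake arithmetic; it is illustrative, not part of the patch:

```python
# Replays the CMake counter: set(dist_ut_port 20001), then per test
# math(EXPR dist_ut_port "${dist_ut_port}+10"), with a FATAL_ERROR once the
# counter reaches 21198 ("available ports have been exhausted").
start, step, limit = 20001, 10, 21198

ports = []
port = start
while True:
    ports.append(port)  # this test runs with PADDLE_DIST_UT_PORT=<port>
    port += step
    if port >= limit:   # CMake would abort here instead of breaking
        break

print(len(ports), ports[-1])  # -> 120 slots, last one at 21191
```

Ports from 21200 upward are deliberately left free: the generated `collective/` and `fleet/` CMakeLists below pin fixed `PADDLE_DIST_UT_PORT` values in that range (21200, 21202, ... 21298).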
if(WITH_GPU OR WITH_ROCM) - set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT - 300) - set_tests_properties(test_imperative_auto_mixed_precision_for_eager - PROPERTIES TIMEOUT 300) - set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT - 120) set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 6775b238ed9e7..a69973eeb9208 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -6,64 +6,271 @@ set(LOCAL_ALL_ARCH ON) set(LOCAL_ALL_PLAT ON) if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_allreduce - MODULES - test_allreduce + test_allreduce MODULES test_allreduce ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_allreduce PROPERTIES TIMEOUT "120" RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_broadcast MODULES test_broadcast ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_broadcast PROPERTIES TIMEOUT "120" RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_c_concat MODULES test_c_concat ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_c_concat PROPERTIES TIMEOUT "120" RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_c_identity MODULES test_c_identity ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_c_identity PROPERTIES TIMEOUT "120" RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_c_split MODULES test_c_split ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_c_split PROPERTIES TIMEOUT "120" RUN_SERIAL 1) +endif() +if((WITH_ROCM OR WITH_GPU) AND (LINUX)) + bash_test_modules( + test_collective_split_embedding + START_BASH + ../dist_test.sh + LABELS + "RUN_TYPE=DIST" ENVS - "PADDLE_DIST_UT_PORT=20071;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" + "PADDLE_DIST_UT_PORT=21288;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" ) - set_tests_properties(test_allreduce PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + set_tests_properties(test_collective_split_embedding PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_allgather_api MODULES test_collective_allgather_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_allgather_object_api MODULES + test_collective_allgather_object_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_allgather_object_api + PROPERTIES TIMEOUT "120" RUN_SERIAL 1) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_broadcast - MODULES - test_broadcast + test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS + 
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT "120" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_alltoall_api PROPERTIES TIMEOUT "120" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + bash_test_modules( + test_collective_alltoall_single + START_BASH + ../dist_test.sh + LABELS + "RUN_TYPE=DIST" ENVS - "PADDLE_DIST_UT_PORT=20073;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" + "PADDLE_DIST_UT_PORT=21290;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python" ) - set_tests_properties(test_broadcast PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + set_tests_properties(test_collective_alltoall_single PROPERTIES TIMEOUT "350" + RUN_SERIAL 1) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_c_concat - MODULES - test_c_concat + test_collective_barrier_api MODULES test_collective_barrier_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + bash_test_modules( + test_collective_batch_isend_irecv + START_BASH + ../dist_test.sh + LABELS + "RUN_TYPE=DIST" ENVS - "PADDLE_DIST_UT_PORT=20075;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" + "PADDLE_DIST_UT_PORT=21292;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python" ) - set_tests_properties(test_c_concat PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + set_tests_properties(test_collective_batch_isend_irecv + PROPERTIES TIMEOUT "350" RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT "120" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_cpu_barrier_with_gloo MODULES + test_collective_cpu_barrier_with_gloo ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_cpu_barrier_with_gloo + PROPERTIES TIMEOUT "300" RUN_SERIAL 1) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_c_identity - MODULES - test_c_identity + test_collective_global_gather MODULES test_collective_global_gather ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT "200" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_global_scatter MODULES test_collective_global_scatter ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT "200" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_optimizer MODULES test_collective_optimizer ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_optimizer PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + 
bash_test_modules( + test_collective_process_group + START_BASH + ../dist_test.sh + LABELS + "RUN_TYPE=DIST" ENVS - "PADDLE_DIST_UT_PORT=20077;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" + "PADDLE_DIST_UT_PORT=21294;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python" ) - set_tests_properties(test_c_identity PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT "350" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_reduce MODULES test_collective_reduce ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_c_split - MODULES - test_c_split + test_collective_reduce_api MODULES test_collective_reduce_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + bash_test_modules( + test_collective_reduce_scatter + START_BASH + ../dist_test.sh + LABELS + "RUN_TYPE=DIST" ENVS - "PADDLE_DIST_UT_PORT=20079;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" + "PADDLE_DIST_UT_PORT=21296;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python" ) - set_tests_properties(test_c_split PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + set_tests_properties(test_collective_reduce_scatter PROPERTIES TIMEOUT "350" + RUN_SERIAL 1) endif() -if((WITH_ROCM OR WITH_GPU) AND (LINUX)) +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_scatter MODULES test_collective_scatter ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_scatter PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_scatter_api MODULES test_collective_scatter_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_sendrecv MODULES test_collective_sendrecv ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_sendrecv PROPERTIES TIMEOUT "300" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_sendrecv_api PROPERTIES TIMEOUT "120" + RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_split_col_linear MODULES test_collective_split_col_linear + ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_split_col_linear + PROPERTIES TIMEOUT "300" RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_split_embedding_none_divisible MODULES + test_collective_split_embedding_none_divisible ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_split_embedding_none_divisible + PROPERTIES TIMEOUT "300" 
RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_split_row_linear MODULES test_collective_split_row_linear + ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_split_row_linear + PROPERTIES TIMEOUT "300" RUN_SERIAL 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_collective_wait MODULES test_collective_wait ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_collective_wait PROPERTIES TIMEOUT "300" RUN_SERIAL + 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_eager_dist_api MODULES test_eager_dist_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" RUN_SERIAL + 1) +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_new_group_api MODULES test_new_group_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" RUN_SERIAL 1) +endif() +if((WITH_GPU + OR WITH_ROCM + OR WITH_ASCEND + OR WITH_ASCEND_CL + ) + AND LOCAL_ALL_PLAT) bash_test_modules( - test_collective_split_embedding + test_gen_nccl_id_op START_BASH ../dist_test.sh LABELS "RUN_TYPE=DIST" ENVS - "PADDLE_DIST_UT_PORT=20081;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=" + "PADDLE_DIST_UT_PORT=21298;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python" ) - set_tests_properties(test_collective_split_embedding PROPERTIES TIMEOUT "300" - RUN_SERIAL 1) + set_tests_properties(test_gen_nccl_id_op PROPERTIES RUN_SERIAL 1) endif() +add_subdirectory(fleet) +add_subdirectory(multinode)
diff --git a/python/paddle/fluid/tests/unittests/collective/README.md b/python/paddle/fluid/tests/unittests/collective/README.md index f819de2484469..f34a177570cd1 100644 --- a/python/paddle/fluid/tests/unittests/collective/README.md +++ b/python/paddle/fluid/tests/unittests/collective/README.md @@ -6,16 +6,15 @@ and specify the properties for the new unit test the properties are the following:
* `name`: the test's name
-* `os`: The supported operator system, ignoring case. If the test run in multiple operator systems, use ";" to split systems, forexample, `apple;linux` means the test runs on both Apple and Linux. The supported values are `linux`,`win32` and `apple`. If the value is empty, this means the test runs on all opertaor systems.
-* `arch`: the device's architecture. similar to `os`, multiple valuse ars splited by ";" and ignoring case. The supported architectures are `gpu`, `xpu`, `npu` and `rocm`.
+* `os`: the supported operating system, ignoring case. If the test runs on multiple operating systems, use ";" to separate them, for example, `apple;linux` means the test runs on both Apple and Linux. The supported values are `linux`, `win32` and `apple`. If the value is empty, the test runs on all operating systems.
+* `arch`: the device's architecture. Similar to `os`, multiple values are split by ";", ignoring case. The supported architectures are `gpu`, `xpu`, `ASCEND`, `ASCEND_CL` and `rocm`.
* `timeout`: the timeout of a unittest, in seconds.
* `run_type`: the run_type of a unittest. Supported values are `NIGHTLY`, `EXCLUSIVE`, `CINN`, `DIST`, `GPUPS`, `INFER`, `EXCLUSIVE:NIGHTLY`, `DIST:NIGHTLY`, which are case-insensitive.
* `launcher`: the test launcher. Supported values are test_runner.py, dist_test.sh and custom scripts' names.
-* `dist_ut_port`: the starting port used in a distributed unit test
+* `num_port`: the number of ports used in a distributed unit test
* `run_serial`: whether to run in serial mode. The value can be 1 or 0. Default (empty) is 0.
* `ENVS`: required environment variables. Multiple environments are split by ";".
-* `conditions`: extra required conditions for some tests. the value is a boolean expression in cmake programmer.
-
+* `conditions`: extra required conditions for some tests. The value is a list of boolean expressions in CMake grammar, split by ";". For example, the value can be `WITH_DGC;NOT WITH_NCCL` or `WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212`. The relationship between these expressions is a conjunction.
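The property table above is the contract consumed by `tools/gen_ut_cmakelists.py` when it regenerates each `CMakeLists.txt` from `testslist.csv`. The generator itself is not part of this diff, so the snippet below is only a hypothetical sketch of how one csv row could expand into the guarded blocks visible in the generated files; the row values and the `emit` helper are invented for illustration:

```python
# Hypothetical sketch of the csv -> CMake expansion. The real logic lives in
# tools/gen_ut_cmakelists.py (not shown in this diff); everything here except
# the csv column names is an assumption.
row = {
    "name": "test_collective_reduce_api",
    "os": "linux",
    "arch": "gpu;rocm",
    "timeout": "300",
    "run_type": "DIST",
    "launcher": "test_runner.py",
    "num_port": "",          # empty: no dist port reserved for this test
    "run_serial": "1",
    "ENVS": "http_proxy=;https_proxy=",
    "conditions": "",        # e.g. "WITH_DGC;NOT WITH_NCCL" (a conjunction)
}


def emit(row):
    name = row["name"]
    archs = [a for a in row["arch"].split(";") if a]
    guard = " OR ".join("WITH_" + a.upper() for a in archs) or "LOCAL_ALL_ARCH"
    out = [
        "if((%s) AND (LINUX))" % guard,  # os handling simplified to linux-only
        '  py_test_modules(%s MODULES %s ENVS "%s")' % (name, name, row["ENVS"]),
    ]
    props = []
    if row["timeout"]:
        props.append('TIMEOUT "%s"' % row["timeout"])
    if row["run_serial"] == "1":
        props.append("RUN_SERIAL 1")
    if props:
        out.append("  set_tests_properties(%s PROPERTIES %s)"
                   % (name, " ".join(props)))
    out.append("endif()")
    return "\n".join(out)


print(emit(row))  # prints an if((WITH_GPU OR WITH_ROCM) AND (LINUX)) block
```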
### step 3. Generate CMakeLists.txt
Run the cmd:
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/collective_allgather_api.py
rename to python/paddle/fluid/tests/unittests/collective/collective_allgather_api.py
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/collective_allgather_api_dygraph.py
rename to python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_object_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_object_api_dygraph.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/collective_allgather_object_api_dygraph.py
rename to python/paddle/fluid/tests/unittests/collective/collective_allgather_object_api_dygraph.py
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/collective_allreduce_api.py
rename to python/paddle/fluid/tests/unittests/collective/collective_allreduce_api.py
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_new_group_api.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py
rename to python/paddle/fluid/tests/unittests/collective/collective_allreduce_new_group_api.py
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_op_wait.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_op_wait.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/collective_allreduce_op_wait.py
rename to python/paddle/fluid/tests/unittests/collective/collective_allreduce_op_wait.py
diff --git a/python/paddle/fluid/tests/unittests/collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/collective_alltoall_api.py
rename to python/paddle/fluid/tests/unittests/collective/collective_alltoall_api.py
diff --git a/python/paddle/fluid/tests/unittests/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py
similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_alltoall_api_dygraph.py rename to python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py diff --git a/python/paddle/fluid/tests/unittests/collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_alltoall_single.py rename to python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py diff --git a/python/paddle/fluid/tests/unittests/collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective/collective_barrier_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_barrier_api.py rename to python/paddle/fluid/tests/unittests/collective/collective_barrier_api.py diff --git a/python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py rename to python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_broadcast_api.py rename to python/paddle/fluid/tests/unittests/collective/collective_broadcast_api.py diff --git a/python/paddle/fluid/tests/unittests/collective_global_gather.py b/python/paddle/fluid/tests/unittests/collective/collective_global_gather.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_global_gather.py rename to python/paddle/fluid/tests/unittests/collective/collective_global_gather.py diff --git a/python/paddle/fluid/tests/unittests/collective_global_gather_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_global_gather_dygraph.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_global_gather_dygraph.py rename to python/paddle/fluid/tests/unittests/collective/collective_global_gather_dygraph.py diff --git a/python/paddle/fluid/tests/unittests/collective_global_scatter.py b/python/paddle/fluid/tests/unittests/collective/collective_global_scatter.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_global_scatter.py rename to python/paddle/fluid/tests/unittests/collective/collective_global_scatter.py diff --git a/python/paddle/fluid/tests/unittests/collective_global_scatter_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_global_scatter_dygraph.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_global_scatter_dygraph.py rename to python/paddle/fluid/tests/unittests/collective/collective_global_scatter_dygraph.py diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_reduce_api.py rename to python/paddle/fluid/tests/unittests/collective/collective_reduce_api.py diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_reduce_op.py rename 
to python/paddle/fluid/tests/unittests/collective/collective_reduce_op.py diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_op_calc_stream.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py rename to python/paddle/fluid/tests/unittests/collective/collective_reduce_op_calc_stream.py diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_reduce_scatter.py rename to python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_scatter_api.py rename to python/paddle/fluid/tests/unittests/collective/collective_scatter_api.py diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_op.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_scatter_op.py rename to python/paddle/fluid/tests/unittests/collective/collective_scatter_op.py diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_sendrecv_api.py rename to python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api.py diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_sendrecv_api_dygraph.py rename to python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_sendrecv_op.py rename to python/paddle/fluid/tests/unittests/collective/collective_sendrecv_op.py diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op_array.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_op_array.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_sendrecv_op_array.py rename to python/paddle/fluid/tests/unittests/collective/collective_sendrecv_op_array.py diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_op_dynamic_shape.py similarity index 100% rename from python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py rename to python/paddle/fluid/tests/unittests/collective/collective_sendrecv_op_dynamic_shape.py diff --git a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/column_parallel_linear_api.py rename to python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py diff 
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt
new file mode 100644
index 0000000000000..1d1555839a0fd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt
@@ -0,0 +1,865 @@
+# This file is generated by ${PADDLE_ROOT}/tools/gen_ut_cmakelists.py.
+# Please don't modify this file manually.
+# If you need to change unittests in this file, please modify testslist.csv in the current directory
+# and then run the command `python3 ${PADDLE_ROOT}/tools/gen_ut_cmakelists.py -f ${CURRENT_DIRECTORY}/testslist.csv`
+set(LOCAL_ALL_ARCH ON)
+set(LOCAL_ALL_PLAT ON)
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_fleet_sharding_meta_optimizer MODULES
+    test_fleet_sharding_meta_optimizer ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_sharding_meta_optimizer
+                       PROPERTIES TIMEOUT "350" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
+  py_test_modules(
+    test_fleet_static_mp_layers MODULES test_fleet_static_mp_layers ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_static_mp_layers PROPERTIES RUN_SERIAL 1)
+endif()
+if(WITH_DGC)
+  if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+    py_test_modules(
+      test_dgc_op MODULES test_dgc_op ENVS
+      "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+    set_tests_properties(test_dgc_op PROPERTIES RUN_SERIAL 1)
+  endif()
+endif()
+if(WITH_DGC)
+  if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+    py_test_modules(
+      test_dgc_optimizer MODULES test_dgc_optimizer ENVS
+      "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+    set_tests_properties(test_dgc_optimizer PROPERTIES RUN_SERIAL 1)
+  endif()
+endif()
+if(WITH_NCCL)
+  if((WITH_GPU) AND LOCAL_ALL_PLAT)
+    bash_test_modules(
+      test_parallel_margin_cross_entropy
+      START_BASH
+      ../../dist_test.sh
+      LABELS
+      "RUN_TYPE=DIST"
+      ENVS
+      "PADDLE_DIST_UT_PORT=21200;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+      )
+    set_tests_properties(test_parallel_margin_cross_entropy
+                         PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+  endif()
+endif()
+if(WITH_NCCL)
+  if((WITH_GPU) AND LOCAL_ALL_PLAT)
+    bash_test_modules(
+      test_dygraph_sharding_stage3
+      START_BASH
+      ../../dist_test.sh
+      LABELS
+      "RUN_TYPE=DIST"
+      ENVS
+      "PADDLE_DIST_UT_PORT=21202;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+      )
+    set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT "350"
+                                                                 RUN_SERIAL 1)
+  endif()
+endif()
+if(WITH_NCCL)
+  if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
+    if((WITH_GPU) AND LOCAL_ALL_PLAT)
+      bash_test_modules(
+        test_parallel_dygraph_transformer
+        START_BASH
+        ../../dist_test.sh
+        LABELS
+        "RUN_TYPE=DIST"
+        ENVS
+        "PADDLE_DIST_UT_PORT=21204;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+        )
+      set_tests_properties(test_parallel_dygraph_transformer
+                           PROPERTIES RUN_SERIAL 1)
+    endif()
+  endif()
+endif()
+if((WITH_ROCM) AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_transformer
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21206;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_transformer PROPERTIES RUN_SERIAL
+                                                                    1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
+  py_test_modules(
+    test_fleet_fp16_allreduce_meta_optimizer MODULES
+    test_fleet_fp16_allreduce_meta_optimizer ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_fp16_allreduce_meta_optimizer
+                       PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_rnn_dp
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21208;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_rnn_dp PROPERTIES RUN_SERIAL 1)
+endif()
+if(WITH_NCCL)
+  if((WITH_GPU) AND LOCAL_ALL_PLAT)
+    bash_test_modules(
+      test_parallel_dygraph_mp_layers
+      START_BASH
+      ../../dist_test.sh
+      LABELS
+      "RUN_TYPE=DIST"
+      ENVS
+      "PADDLE_DIST_UT_PORT=21210;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+      )
+    set_tests_properties(test_parallel_dygraph_mp_layers
+                         PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+  endif()
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR APPLE))
+  bash_test_modules(
+    test_tcp_store
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21212;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_tcp_store PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_dygraph_sharding_stage3_for_eager
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21214;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_dygraph_sharding_stage3_for_eager
+                       PROPERTIES TIMEOUT "350" RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_fleet_graph_execution_meta_optimizer
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21216;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_fleet_graph_execution_meta_optimizer
+                       PROPERTIES RUN_SERIAL 1)
+endif()
+if(WITH_NCCL)
+  if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+    py_test_modules(
+      test_communicator_half_async
+      MODULES
+      test_communicator_half_async
+      ENVS
+      "FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+      )
+    set_tests_properties(test_communicator_half_async PROPERTIES TIMEOUT "120"
+                                                                 RUN_SERIAL 1)
+  endif()
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_graph_executor PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU) AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_pipeline_parallel
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21218;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_pipeline_parallel
+                       PROPERTIES TIMEOUT "500" RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND (LINUX))
+  py_test_modules(
+    test_fleet_localsgd_meta_optimizer MODULES
+    test_fleet_localsgd_meta_optimizer ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_localsgd_meta_optimizer PROPERTIES RUN_SERIAL
+                                                                     1)
+endif()
+if(WITH_NCCL)
+  if((WITH_GPU) AND LOCAL_ALL_PLAT)
+    bash_test_modules(
+      test_parallel_class_center_sample
+      START_BASH
+      ../../dist_test.sh
+      LABELS
+      "RUN_TYPE=DIST"
+      ENVS
+      "PADDLE_DIST_UT_PORT=21220;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+      )
+    set_tests_properties(test_parallel_class_center_sample
+                         PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+  endif()
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_pipeline
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21222;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_pipeline PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR APPLE))
+  py_test_modules(
+    test_fleet_utils MODULES test_fleet_utils ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_utils PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_static_model_parallel
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21224;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT "240"
+                                                             RUN_SERIAL 1)
+endif()
+if(WITH_NCCL)
+  if((WITH_GPU) AND LOCAL_ALL_PLAT)
+    bash_test_modules(
+      test_parallel_dygraph_no_sync
+      START_BASH
+      ../../dist_test.sh
+      LABELS
+      "RUN_TYPE=DIST"
+      ENVS
+      "PADDLE_DIST_UT_PORT=21226;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+      )
+    set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT "300"
+                                                                  RUN_SERIAL 1)
+  endif()
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_dygraph_sharding_stage2
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21228;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT "200"
+                                                               RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_control_flow
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21230;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_control_flow
+                       PROPERTIES TIMEOUT "350" RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_fleet_lars_meta_optimizer
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21232;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_fleet_lars_meta_optimizer PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_hybrid_parallel_inference_helper
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21234;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_hybrid_parallel_inference_helper
+                       PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_fleet_rolemaker_new MODULES test_fleet_rolemaker_new ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_rolemaker_new PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND (LINUX OR WIN32))
+  py_test_modules(
+    test_dist_mnist_gradient_merge MODULES test_dist_mnist_gradient_merge ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_dist_mnist_gradient_merge PROPERTIES TIMEOUT "360"
+                                                                 RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_recv_save_op MODULES test_recv_save_op ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_recv_save_op PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_communicator_sync
+    MODULES
+    test_communicator_sync
+    ENVS
+    "FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_communicator_sync PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_fleet_pipeline_meta_optimizer
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21236;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_fleet_pipeline_meta_optimizer PROPERTIES RUN_SERIAL
+                                                                     1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_fleet_gradient_merge_meta_optimizer MODULES
+    test_fleet_gradient_merge_meta_optimizer ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_gradient_merge_meta_optimizer
+                       PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
+  py_test_modules(
+    test_fleet_amp_init MODULES test_fleet_amp_init ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_amp_init PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_dygraph_sharding_optimizer_stage2
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21238;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_dygraph_sharding_optimizer_stage2
+                       PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
+  py_test_modules(
+    test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_meta_optimizer_base PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_fleet_raw_program_meta_optimizer
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21240;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_fleet_raw_program_meta_optimizer
+                       PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_sharding_parallel
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21242;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_sharding_parallel
+                       PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_tensor_parallel
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21244;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_tensor_parallel
+                       PROPERTIES TIMEOUT "200" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_dygraph_group_sharded_api_for_eager
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21246;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_dygraph_group_sharded_api_for_eager
+                       PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
+  py_test_modules(
+    test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy
+    ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_distributed_strategy PROPERTIES RUN_SERIAL 1)
+endif()
+if(WITH_DGC)
+  if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+    py_test_modules(
+      test_fleet_dgc_meta_optimizer MODULES test_fleet_dgc_meta_optimizer ENVS
+      "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+    set_tests_properties(test_fleet_dgc_meta_optimizer PROPERTIES RUN_SERIAL 1)
+  endif()
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_unused_variables
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21248;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_unused_variables
+                       PROPERTIES TIMEOUT "350" RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND (LINUX))
+  py_test_modules(
+    test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_lamb_meta_optimizer PROPERTIES RUN_SERIAL 1)
+endif()
+if(WITH_DGC)
+  if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+    py_test_modules(
+      test_dgc_momentum_op MODULES test_dgc_momentum_op ENVS
+      "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+    set_tests_properties(test_dgc_momentum_op PROPERTIES RUN_SERIAL 1)
+  endif()
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_no_sync_gradient_check
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21250;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_no_sync_gradient_check
+                       PROPERTIES TIMEOUT "60" RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_fleet_pipeline_meta_optimizer_with_recompute
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21252;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_fleet_pipeline_meta_optimizer_with_recompute
+                       PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND (WIN32 OR LINUX))
+  py_test_modules(
+    test_fleet_hybrid_meta_optimizer MODULES test_fleet_hybrid_meta_optimizer
+    ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_hybrid_meta_optimizer PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_qat
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21254;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_qat PROPERTIES TIMEOUT "120"
+                                                            RUN_SERIAL 1)
+endif()
+if(WITH_NCCL)
+  if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
+    if((WITH_GPU) AND LOCAL_ALL_PLAT)
+      bash_test_modules(
+        test_parallel_dygraph_sparse_embedding
+        START_BASH
+        ../../dist_test.sh
+        LABELS
+        "RUN_TYPE=DIST"
+        ENVS
+        "PADDLE_DIST_UT_PORT=21256;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+        )
+      set_tests_properties(test_parallel_dygraph_sparse_embedding
+                           PROPERTIES TIMEOUT "200" RUN_SERIAL 1)
+    endif()
+  endif()
+endif()
+if((WITH_ROCM) AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_sparse_embedding
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21258;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_sparse_embedding
+                       PROPERTIES TIMEOUT "200" RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_amp_meta_optimizer PROPERTIES RUN_SERIAL 1)
+endif()
+if(WITH_NCCL)
+  if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
+    if((WITH_GPU) AND LOCAL_ALL_PLAT)
+      bash_test_modules(
+        test_parallel_dygraph_sparse_embedding_over_height
+        START_BASH
+        ../../dist_test.sh
+        LABELS
+        "RUN_TYPE=DIST"
+        ENVS
+        "PADDLE_DIST_UT_PORT=21260;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+        )
+      set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height
+                           PROPERTIES TIMEOUT "150" RUN_SERIAL 1)
+    endif()
+  endif()
+endif()
+if((WITH_ROCM) AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_sparse_embedding_over_height
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21262;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height
+                       PROPERTIES TIMEOUT "350" RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR APPLE))
+  py_test_modules(
+    test_distributed_strategy MODULES test_distributed_strategy ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_distributed_strategy PROPERTIES RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_auto_parallel_parallelizer
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21264;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT "120"
+                                                                  RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND (LINUX OR WIN32))
+  py_test_modules(
+    test_fleet_recompute_meta_optimizer MODULES
+    test_fleet_recompute_meta_optimizer ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_recompute_meta_optimizer PROPERTIES RUN_SERIAL
+                                                                      1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_dygraph_group_sharded_api
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21266;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT "120"
+                                                                 RUN_SERIAL 1)
+endif()
+if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
+  py_test_modules(
+    test_fleet_private_function MODULES test_fleet_private_function ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_fleet_private_function PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_new_group
+    START_BASH
+    test_new_group.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21268;http_proxy=;https_proxy=")
+  set_tests_properties(test_new_group PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU
+    OR WITH_XPU
+    OR WITH_ASCEND
+    OR WITH_ASCEND_CL
+    )
+   AND (LINUX))
+  bash_test_modules(
+    test_c_comm_init_op
+    START_BASH
+    test_c_comm_init_op.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21270;http_proxy=;https_proxy=")
+  set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT "120" RUN_SERIAL
+                                                                    1)
+endif()
+if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_ir_pass_pipeline
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21272;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "120"
+                                                        RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_mnist
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21274;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_mnist PROPERTIES TIMEOUT "200"
+                                                              RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_parallel_dygraph_se_resnext
+    START_BASH
+    ../../dist_test.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21276;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    )
+  set_tests_properties(test_parallel_dygraph_se_resnext
+                       PROPERTIES TIMEOUT "200" RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_parallel_dygraph_sync_batch_norm MODULES
+    test_parallel_dygraph_sync_batch_norm ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_parallel_dygraph_sync_batch_norm
+                       PROPERTIES TIMEOUT "120" RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_imperative_auto_mixed_precision MODULES
+    test_imperative_auto_mixed_precision ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_imperative_auto_mixed_precision
+                       PROPERTIES TIMEOUT "300" RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_imperative_auto_mixed_precision_for_eager MODULES
+    test_imperative_auto_mixed_precision_for_eager ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_imperative_auto_mixed_precision_for_eager
+                       PROPERTIES TIMEOUT "300" RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_mixed_precision MODULES test_mixed_precision ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_mixed_precision PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_dygraph_recompute MODULES test_dygraph_recompute ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_dygraph_recompute PROPERTIES RUN_SERIAL 1)
+endif()
+if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+  py_test_modules(
+    test_dygraph_recompute_for_eager MODULES test_dygraph_recompute_for_eager
+    ENVS
+    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+  set_tests_properties(test_dygraph_recompute_for_eager PROPERTIES RUN_SERIAL 1)
+endif()
+if(WITH_NCCL OR WITH_RCCL)
+  if(WITH_DGC)
+    if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+      bash_test_modules(
+        test_dist_mnist_dgc_nccl
+        START_BASH
+        ../../dist_test.sh
+        LABELS
+        "RUN_TYPE=DIST"
+        ENVS
+        "PADDLE_DIST_UT_PORT=21278;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+        )
+      set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES RUN_SERIAL 1)
+    endif()
+  endif()
+endif()
+if(WITH_NCCL OR WITH_RCCL)
+  if(WITH_DGC)
+    if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
+      bash_test_modules(
+        test_dist_se_resnext_dgc
+        START_BASH
+        ../../dist_test.sh
+        LABELS
+        "RUN_TYPE=DIST"
+        ENVS
+        "PADDLE_DIST_UT_PORT=21280;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+        )
+      set_tests_properties(test_dist_se_resnext_dgc PROPERTIES RUN_SERIAL 1)
+    endif()
+  endif()
+endif()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py
diff --git a/python/paddle/fluid/tests/unittests/c_comm_init_op.py b/python/paddle/fluid/tests/unittests/collective/fleet/c_comm_init_op.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/c_comm_init_op.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/c_comm_init_op.py
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py b/python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge.py
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge_raw_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge_raw_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge_raw_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge_raw_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api_eager.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api_eager.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage2.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage2.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage2_offload.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage2_offload.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_optimizer_stage2.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_optimizer_stage2.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2_offload.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage2_offload.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3.py
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3_offload.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/dygraph_sharding_stage3_offload.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_communicate_group.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_communicate_group.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_communicate_group.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_communicate_group.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_inference_helper.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_inference_helper.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_inference_helper.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_inference_helper.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_amp.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_amp.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_clip_grad.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_clip_grad.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_fp16.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_fp16.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_layers.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_layers.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_random.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_random.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_mp_random.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_random.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_amp.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_amp.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_clip_grad.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_clip_grad.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_fp16.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_fp16.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_recompute.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_recompute.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_save_load.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_save_load.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_qat.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_qat.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_sharding_model.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_sharding_model.py
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py
diff --git a/python/paddle/fluid/tests/unittests/new_group.py b/python/paddle/fluid/tests/unittests/collective/fleet/new_group.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/new_group.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/new_group.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_class_center_sample.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_class_center_sample.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_class_center_sample.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_same.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_same.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync_control_flow.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync_control_flow.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync_unused_params.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_no_sync_unused_params.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_sync_batch_norm.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_sync_batch_norm.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_dygraph_transformer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py
diff --git a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/pipeline_mnist.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_parallel_parallelizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_auto_parallel_parallelizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/collective/fleet/test_c_comm_init_op.sh
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_c_comm_init_op.sh
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_communicator_half_async.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_communicator_half_async.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_communicator_half_async.py
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_communicator_sync.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_communicator_sync.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_communicator_sync.py
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_momentum_op.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_momentum_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_op.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_op.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dgc_op.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dist_mnist_dgc_nccl.py
similarity index 91%
rename from python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dist_mnist_dgc_nccl.py
index ec9c0e9680577..80a2bbad461bc 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dist_mnist_dgc_nccl.py
@@ -49,7 +49,7 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py",
+            self.check_with_place(os.path.abspath("../../dist_mnist.py"),
                                   delta=1e-5,
                                   check_error_log=True,
                                   log_name=flag_name)
@@ -80,10 +80,11 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place_multi_cards("dist_mnist.py",
-                                              delta=1e-5,
-                                              check_error_log=True,
-                                              log_name=flag_name)
+            self.check_with_place_multi_cards(
+                os.path.abspath("../../dist_mnist.py"),
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
 
     def tearDown(self):
         import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_gradient_merge.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dist_mnist_gradient_merge.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dist_mnist_gradient_merge.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dist_mnist_gradient_merge.py
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dist_se_resnext_dgc.py
similarity index 94%
rename from python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dist_se_resnext_dgc.py
index 86101cf9fe4db..af5495b161948 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dist_se_resnext_dgc.py
@@ -17,8 +17,6 @@
 
 from test_dist_base import TestDistBase
 import os
-import os
-
 flag_name = os.path.splitext(__file__)[0]
 
 
@@ -35,7 +35,7 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_se_resnext.py",
+            self.check_with_place(os.path.abspath("../../dist_se_resnext.py"),
                                   delta=30,
                                   check_error_log=True,
                                   log_name=flag_name)
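The two DGC test hunks above show the path fix-up this move requires everywhere: the shared runner scripts (dist_mnist.py, dist_se_resnext.py) stay in the old unittests/ directory while the test drivers now run from unittests/collective/fleet/, two levels below, so bare script names are replaced by absolute paths resolved at call time. A minimal sketch of the idea (illustrative only; check_with_place is Paddle's test helper and is not defined here):

    import os

    # Running from .../unittests/collective/fleet/: the shared runner
    # script is two directories up, so resolve it against the current
    # working directory before handing it to the distributed launcher,
    # which may spawn subprocesses from a different directory.
    script = os.path.abspath("../../dist_mnist.py")
    assert script.endswith("dist_mnist.py")
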
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_distributed_strategy.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_distributed_strategy.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_distributed_strategy.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api_for_eager.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_recompute.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_recompute_for_eager.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_optimizer_stage2.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_optimizer_stage2.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage2.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage2.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage3.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage3.py
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage3_for_eager.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3_for_eager.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage3_for_eager.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_amp_init.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_amp_init.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_amp_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_amp_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_dgc_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_dgc_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_distributed_strategy.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_distributed_strategy.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_gradient_merge_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_gradient_merge_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_graph_execution_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_graph_execution_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_graph_executor.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_graph_executor.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_lamb_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_lamb_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_lars_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_lars_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_localsgd_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_localsgd_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_meta_optimizer_base.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_meta_optimizer_base.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_pipeline_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_pipeline_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_pipeline_meta_optimizer_with_recompute.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_pipeline_meta_optimizer_with_recompute.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_private_function.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_private_function.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_private_function.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_raw_program_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_raw_program_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_recompute_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_recompute_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_rolemaker_new.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_rolemaker_new.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_static_mp_layers.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_static_mp_layers.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_static_mp_layers.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_static_mp_layers.py
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_utils.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_utils.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fleet_utils.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_utils.py
diff --git a/python/paddle/fluid/tests/unittests/test_hybrid_parallel_inference_helper.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_hybrid_parallel_inference_helper.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_hybrid_parallel_inference_helper.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_hybrid_parallel_inference_helper.py
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py
diff --git a/python/paddle/fluid/tests/unittests/test_ir_pass_pipeline.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_ir_pass_pipeline.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_ir_pass_pipeline.py
rename to python/paddle/fluid/tests/unittests/collective/fleet/test_ir_pass_pipeline.py
python/paddle/fluid/tests/unittests/collective/fleet/test_ir_pass_pipeline.py diff --git a/python/paddle/fluid/tests/unittests/test_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_mixed_precision.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_mixed_precision.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_mixed_precision.py diff --git a/python/paddle/fluid/tests/unittests/test_new_group.sh b/python/paddle/fluid/tests/unittests/collective/fleet/test_new_group.sh similarity index 100% rename from python/paddle/fluid/tests/unittests/test_new_group.sh rename to python/paddle/fluid/tests/unittests/collective/fleet/test_new_group.sh diff --git a/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_class_center_sample.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_class_center_sample.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_control_flow.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_control_flow.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_mnist.py similarity index 75% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_mnist.py index e25a74863e483..78df9976031f2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_mnist.py @@ -36,10 +36,11 @@ def _setup_config(self): def test_mnist(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place("parallel_dygraph_mnist.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) + self.check_with_place( + os.path.abspath("../../parallel_dygraph_mnist.py"), + delta=1e-5, + check_error_log=True, + log_name=flag_name) #TODO(liuyuhui): Multi-Card Baidu Kunlun XPU training exist accuracy problems @@ -55,10 +56,11 @@ def _setup_config(self): def test_mnist_xpu(self): if fluid.core.is_compiled_with_xpu(): - self.check_with_place("parallel_dygraph_mnist.py", - delta=1e-4, - check_error_log=True, - log_name=flag_name) + self.check_with_place( + os.path.abspath("../../parallel_dygraph_mnist.py"), + delta=1e-4, + check_error_log=True, + log_name=flag_name) class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): @@ -80,10 +82,11 @@ def _setup_config(self): def test_mnist(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place("parallel_dygraph_mnist.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) + self.check_with_place( + os.path.abspath("../../parallel_dygraph_mnist.py"), + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestFleetDygraphMnistXPU(TestDistBase): @@ -97,10 +100,11 @@ def _setup_config(self): def test_mnist(self): if fluid.core.is_compiled_with_xpu(): - self.check_with_place("parallel_dygraph_mnist.py", - delta=1e-4, - check_error_log=True, - log_name=flag_name) + 
self.check_with_place( + os.path.abspath("../../parallel_dygraph_mnist.py"), + delta=1e-4, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_mp_layers.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_mp_layers.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_no_sync.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_no_sync.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_no_sync_gradient_check.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_no_sync_gradient_check.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py similarity index 92% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py index 5357a6a132a34..9bbae0928a779 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py @@ -24,8 +24,11 @@ class TestHybridPipeParallel(TestMultipleGpus): def test_hybrid_parallel_pp_layer(self): - self.run_mnist_2gpu('hybrid_parallel_pp_layer.py') - self.run_mnist_2gpu('hybrid_parallel_pp_layer.py', eager_mode=False) + self.run_mnist_2gpu( + os.path.abspath('../../hybrid_parallel_pp_layer.py')) + self.run_mnist_2gpu( + os.path.abspath('../../hybrid_parallel_pp_layer.py'), + eager_mode=False) def test_hybrid_parallel_pp_tuple_inputs(self): self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_qat.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_qat.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_qat.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_qat.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_se_resnext.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_se_resnext.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sharding_parallel.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py rename to 
python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sharding_parallel.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sparse_embedding.py similarity index 86% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sparse_embedding.py index ae65b545a9534..e38a8e578582d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sparse_embedding.py @@ -36,10 +36,11 @@ def _setup_config(self): def test_sparse_embedding(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place("parallel_dygraph_sparse_embedding.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) + self.check_with_place( + os.path.abspath("../../parallel_dygraph_sparse_embedding.py"), + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestParallelDygraphSparseEmdeddingFP64(TestDistBase): @@ -51,7 +52,8 @@ def _setup_config(self): def test_sparse_embedding_fp64(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place("parallel_dygraph_sparse_embedding_fp64.py", + self.check_with_place(os.path.abspath( + "../../parallel_dygraph_sparse_embedding_fp64.py"), delta=1e-5, check_error_log=True, log_name=flag_name) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sparse_embedding_over_height.py similarity index 84% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sparse_embedding_over_height.py index 7b1cd0efcdf27..d7a28d241ff92 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sparse_embedding_over_height.py @@ -35,11 +35,11 @@ def _setup_config(self): def test_sparse_embedding(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_sparse_embedding_over_height.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) + self.check_with_place(os.path.abspath( + "../../parallel_dygraph_sparse_embedding_over_height.py"), + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestParallelDygraphSparseEmdeddingOverHeightSpawn(TestDistSpawnRunner): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sync_batch_norm.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_sync_batch_norm.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_tensor_parallel.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_tensor_parallel.py diff --git 
a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_transformer.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_transformer.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_unused_variables.py similarity index 76% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_unused_variables.py index 1f71514cc7372..172cab2acfd11 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_unused_variables.py @@ -35,10 +35,11 @@ def _setup_config(self): def test_net(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place("parallel_dygraph_unused_variables.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) + self.check_with_place( + os.path.abspath("../../parallel_dygraph_unused_variables.py"), + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar): @@ -67,10 +68,11 @@ def _setup_config(self): def test_net(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place("parallel_dygraph_none_var.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) + self.check_with_place( + os.path.abspath("../../parallel_dygraph_none_var.py"), + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestParallelDygraphSharedUnusedVariables(TestDistBase): @@ -82,10 +84,11 @@ def _setup_config(self): def test_mnist(self): if fluid.core.is_compiled_with_cuda(): - self.check_with_place("parallel_dygraph_shared_unused_var.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) + self.check_with_place( + os.path.abspath("../../parallel_dygraph_shared_unused_var.py"), + delta=1e-5, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_margin_cross_entropy.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_margin_cross_entropy.py diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_pipeline.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_pipeline.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_pipeline.py diff --git a/python/paddle/fluid/tests/unittests/test_recv_save_op.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_recv_save_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_recv_save_op.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_recv_save_op.py diff --git a/python/paddle/fluid/tests/unittests/test_rnn_dp.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_rnn_dp.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_rnn_dp.py rename to 
python/paddle/fluid/tests/unittests/collective/fleet/test_rnn_dp.py diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_static_model_parallel.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_static_model_parallel.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_static_model_parallel.py diff --git a/python/paddle/fluid/tests/unittests/test_tcp_store.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_tcp_store.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_tcp_store.py rename to python/paddle/fluid/tests/unittests/collective/fleet/test_tcp_store.py diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv new file mode 100644 index 0000000000000..71ba94a02b698 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv @@ -0,0 +1,73 @@ +name,os,arch,timeout,run_type,launcher,num_port,run_serial,envs,conditions +test_fleet_sharding_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,350,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_static_mp_layers,linux;win32,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dgc_op,,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_dgc_optimizer,,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_parallel_margin_cross_entropy,,GPU,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL +test_dygraph_sharding_stage3,,GPU,350,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL +test_parallel_dygraph_transformer,,GPU,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 +test_parallel_dygraph_transformer,,ROCM,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_fp16_allreduce_meta_optimizer,LINUX;WIN32,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_rnn_dp,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_mp_layers,,GPU,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL +test_tcp_store,LINUX;APPLE,,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_sharding_stage3_for_eager,,,350,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_graph_execution_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_communicator_half_async,,,120,DIST,test_runner.py,2,1,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL +test_fleet_graph_executor,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_pipeline_parallel,,GPU,500,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_localsgd_meta_optimizer,LINUX,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_class_center_sample,,GPU,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL 
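The fleet testslist.csv rows above and below follow the header name,os,arch,timeout,run_type,launcher,num_port,run_serial,envs,conditions: ';' separates multiple values inside one field, and an empty field means "no restriction" (os/arch) or "use the default" (timeout). A minimal sketch of reading one such row into named fields; parse_row is a hypothetical helper for illustration, not the actual gen_ut_cmakelists.py code:

    import csv
    import io

    FIELDS = ["name", "os", "arch", "timeout", "run_type",
              "launcher", "num_port", "run_serial", "envs", "conditions"]

    def parse_row(line):
        # split one CSV row, then expand the ';'-separated multi-value fields
        values = next(csv.reader(io.StringIO(line)))
        record = dict(zip(FIELDS, values))
        for key in ("os", "arch", "envs", "conditions"):
            record[key] = [v for v in record[key].split(";") if v]
        return record

    row = ("test_parallel_margin_cross_entropy,,GPU,120,DIST,"
           "../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL")
    print(parse_row(row)["arch"])        # ['GPU']
    print(parse_row(row)["conditions"])  # ['WITH_NCCL']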
+test_pipeline,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_utils,LINUX;APPLE,,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_static_model_parallel,,,240,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_no_sync,,GPU,300,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL +test_dygraph_sharding_stage2,,,200,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_control_flow,,,350,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_lars_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_hybrid_parallel_inference_helper,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_rolemaker_new,,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dist_mnist_gradient_merge,LINUX;WIN32,GPU;ROCM,360,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_recv_save_op,,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_communicator_sync,,,,DIST,test_runner.py,2,1,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_pipeline_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_gradient_merge_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_amp_init,linux;win32,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_sharding_optimizer_stage2,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_meta_optimizer_base,linux;win32,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_raw_program_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_sharding_parallel,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_tensor_parallel,,,200,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_group_sharded_api_for_eager,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_distributed_strategy,linux;win32,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_dgc_meta_optimizer,,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_parallel_dygraph_unused_variables,,,350,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_lamb_meta_optimizer,linux,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dgc_momentum_op,,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_parallel_dygraph_no_sync_gradient_check,,,60,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_pipeline_meta_optimizer_with_recompute,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_hybrid_meta_optimizer,WIN32;LINUX,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_qat,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., 
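The trailing conditions column is what becomes the if() guards of the generated CMakeLists (compare the WITH_MULTINODE_TESTING blocks further below, where os=linux and arch=gpu become if((WITH_GPU) AND (LINUX))): each ';'-separated clause must hold for the test to be registered. A rough sketch of that mapping, assuming a simple AND-join; the real generator may format the expression differently:

    def conditions_to_cmake(conditions):
        # ';' separates clauses that are AND-ed together in the guard
        clauses = [c.strip() for c in conditions.split(";") if c.strip()]
        return " AND ".join("({})".format(c) for c in clauses)

    print(conditions_to_cmake("WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212"))
    # (WITH_NCCL) AND (${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)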
+test_parallel_dygraph_sparse_embedding,,GPU,200,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 +test_parallel_dygraph_sparse_embedding,,ROCM,200,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_amp_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_sparse_embedding_over_height,,GPU,150,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 +test_parallel_dygraph_sparse_embedding_over_height,,ROCM,350,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_distributed_strategy,LINUX;APPLE,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_auto_parallel_parallelizer,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_recompute_meta_optimizer,linux;win32,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_group_sharded_api,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_private_function,linux;win32,,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_new_group,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_new_group.sh,2,1,http_proxy=;https_proxy=, +test_c_comm_init_op,linux,GPU;XPU;ASCEND;ASCEND_CL,120,DIST,test_c_comm_init_op.sh,2,1,http_proxy=;https_proxy=, +test_ir_pass_pipeline,,,120,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_mnist,,GPU;ROCM,200,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_se_resnext,,GPU;ROCM,200,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_sync_batch_norm,,GPU;ROCM,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_imperative_auto_mixed_precision,,GPU;ROCM,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_imperative_auto_mixed_precision_for_eager,,GPU;ROCM,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_mixed_precision,,GPU;ROCM,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_recompute,,GPU;ROCM,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_recompute_for_eager,,GPU;ROCM,,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dist_mnist_dgc_nccl,,,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL OR WITH_RCCL;WITH_DGC +test_dist_se_resnext_dgc,,,,DIST,../../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL OR WITH_RCCL;WITH_DGC diff --git a/python/paddle/fluid/tests/unittests/init_process_group.py b/python/paddle/fluid/tests/unittests/collective/init_process_group.py similarity index 100% rename from python/paddle/fluid/tests/unittests/init_process_group.py rename to python/paddle/fluid/tests/unittests/collective/init_process_group.py diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/multinode/CMakeLists.txt new file mode 100644 index 0000000000000..6947c89a2a21a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/multinode/CMakeLists.txt @@ -0,0 +1,48 @@ +# This file is generated by ${PADDLE_ROOT}/tools/gen_ut_cmakelists.py. +# Please don't modify this file manually. 
+# If you need to change unittests in this file, please modify testslist.csv in the current directory +# and then run the command `python3 ${PADDLE_ROOT}/tools/gen_ut_cmakelists.py -f ${CURRENT_DIRECTORY}/testslist.csv` +set(LOCAL_ALL_ARCH ON) +set(LOCAL_ALL_PLAT ON) +if(WITH_MULTINODE_TESTING) + if((WITH_GPU) AND (LINUX)) + bash_test_modules( + test_multinode_dygraph_hybrid_dpppmp + START_BASH + multinode_dist_test.sh + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=21282;http_proxy=;https_proxy=") + set_tests_properties(test_multinode_dygraph_hybrid_dpppmp + PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + endif() +endif() +if(WITH_MULTINODE_TESTING) + if((WITH_GPU) AND (LINUX)) + bash_test_modules( + test_multinode_dygraph_hybrid_dp + START_BASH + multinode_dist_test.sh + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=21284;http_proxy=;https_proxy=") + set_tests_properties(test_multinode_dygraph_hybrid_dp + PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + endif() +endif() +if(WITH_MULTINODE_TESTING) + if((WITH_GPU) AND (LINUX)) + bash_test_modules( + test_multinode_dygraph_sharding + START_BASH + multinode_dist_test.sh + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=21286;http_proxy=;https_proxy=") + set_tests_properties(test_multinode_dygraph_sharding + PROPERTIES TIMEOUT "120" RUN_SERIAL 1) + endif() +endif() diff --git a/python/paddle/fluid/tests/unittests/common.py b/python/paddle/fluid/tests/unittests/collective/multinode/common.py similarity index 100% rename from python/paddle/fluid/tests/unittests/common.py rename to python/paddle/fluid/tests/unittests/collective/multinode/common.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_hybrid_dp.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dp.py similarity index 100% rename from python/paddle/fluid/tests/unittests/dygraph_hybrid_dp.py rename to python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dp.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_hybrid_dpppmp.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py similarity index 100% rename from python/paddle/fluid/tests/unittests/dygraph_hybrid_dpppmp.py rename to python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_hybrid_fp16.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py similarity index 100% rename from python/paddle/fluid/tests/unittests/dygraph_hybrid_fp16.py rename to python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_hybrid_recompute.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py similarity index 100% rename from python/paddle/fluid/tests/unittests/dygraph_hybrid_recompute.py rename to python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py diff --git a/python/paddle/fluid/tests/unittests/mn_dygraph_group_sharded_stage3.py b/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_group_sharded_stage3.py similarity index 100% rename from python/paddle/fluid/tests/unittests/mn_dygraph_group_sharded_stage3.py rename to python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_group_sharded_stage3.py diff --git a/python/paddle/fluid/tests/unittests/mn_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py 
similarity index 100% rename from python/paddle/fluid/tests/unittests/mn_dygraph_sharding_stage2.py rename to python/paddle/fluid/tests/unittests/collective/multinode/mn_dygraph_sharding_stage2.py diff --git a/python/paddle/fluid/tests/unittests/multinode_dist_test.sh b/python/paddle/fluid/tests/unittests/collective/multinode/multinode_dist_test.sh similarity index 100% rename from python/paddle/fluid/tests/unittests/multinode_dist_test.sh rename to python/paddle/fluid/tests/unittests/collective/multinode/multinode_dist_test.sh diff --git a/python/paddle/fluid/tests/unittests/test_collective_multi_nodes.py b/python/paddle/fluid/tests/unittests/collective/multinode/test_collective_multi_nodes.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_multi_nodes.py rename to python/paddle/fluid/tests/unittests/collective/multinode/test_collective_multi_nodes.py diff --git a/python/paddle/fluid/tests/unittests/test_multinode_dygraph_hybrid_dp.py b/python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_hybrid_dp.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_multinode_dygraph_hybrid_dp.py rename to python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_hybrid_dp.py diff --git a/python/paddle/fluid/tests/unittests/test_multinode_dygraph_hybrid_dpppmp.py b/python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_hybrid_dpppmp.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_multinode_dygraph_hybrid_dpppmp.py rename to python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_hybrid_dpppmp.py diff --git a/python/paddle/fluid/tests/unittests/test_multinode_dygraph_sharding.py b/python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_sharding.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_multinode_dygraph_sharding.py rename to python/paddle/fluid/tests/unittests/collective/multinode/test_multinode_dygraph_sharding.py diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/testslist.csv b/python/paddle/fluid/tests/unittests/collective/multinode/testslist.csv new file mode 100644 index 0000000000000..8ada8ed0be313 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/multinode/testslist.csv @@ -0,0 +1,4 @@ +name,os,arch,timeout,run_type,launcher,num_port,run_serial,ENVS,conditions +test_multinode_dygraph_hybrid_dpppmp,linux,gpu,120,DIST,multinode_dist_test.sh,8,1,http_proxy=;https_proxy=,WITH_MULTINODE_TESTING +test_multinode_dygraph_hybrid_dp,linux,gpu,120,DIST,multinode_dist_test.sh,8,1,http_proxy=;https_proxy=,WITH_MULTINODE_TESTING +test_multinode_dygraph_sharding,linux,gpu,120,DIST,multinode_dist_test.sh,8,1,http_proxy=;https_proxy=,WITH_MULTINODE_TESTING diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py similarity index 100% rename from python/paddle/fluid/tests/unittests/process_group_gloo.py rename to python/paddle/fluid/tests/unittests/collective/process_group_gloo.py diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py similarity index 100% rename from python/paddle/fluid/tests/unittests/process_group_nccl.py rename to python/paddle/fluid/tests/unittests/collective/process_group_nccl.py diff --git 
a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/row_parallel_linear_api.py rename to python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_allgather_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_object_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_object_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_allgather_object_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_allgather_object_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_alltoall_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_barrier_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_barrier_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_barrier_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/collective/test_collective_batch_isend_irecv.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_batch_isend_irecv.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_cpu_barrier_with_gloo.py b/python/paddle/fluid/tests/unittests/collective/test_collective_cpu_barrier_with_gloo.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_cpu_barrier_with_gloo.py rename to 
python/paddle/fluid/tests/unittests/collective/test_collective_cpu_barrier_with_gloo.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_gather.py b/python/paddle/fluid/tests/unittests/collective/test_collective_global_gather.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_global_gather.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_global_gather.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py b/python/paddle/fluid/tests/unittests/collective/test_collective_global_scatter.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_global_scatter.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_global_scatter.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_optimizer.py b/python/paddle/fluid/tests/unittests/collective/test_collective_optimizer.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_optimizer.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_optimizer.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/collective/test_collective_process_group.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_process_group.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_process_group.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_reduce.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_reduce.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_reduce_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_scatter.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_scatter.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_scatter_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_sendrecv.py rename to 
python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_sendrecv_api.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_col_linear.py b/python/paddle/fluid/tests/unittests/collective/test_collective_split_col_linear.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_split_col_linear.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_split_col_linear.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py b/python/paddle/fluid/tests/unittests/collective/test_collective_split_embedding_none_divisible.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_split_embedding_none_divisible.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_row_linear.py b/python/paddle/fluid/tests/unittests/collective/test_collective_split_row_linear.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_split_row_linear.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_split_row_linear.py diff --git a/python/paddle/fluid/tests/unittests/test_collective_wait.py b/python/paddle/fluid/tests/unittests/collective/test_collective_wait.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_collective_wait.py rename to python/paddle/fluid/tests/unittests/collective/test_collective_wait.py diff --git a/python/paddle/fluid/tests/unittests/test_eager_dist_api.py b/python/paddle/fluid/tests/unittests/collective/test_eager_dist_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_eager_dist_api.py rename to python/paddle/fluid/tests/unittests/collective/test_eager_dist_api.py diff --git a/python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py b/python/paddle/fluid/tests/unittests/collective/test_gen_nccl_id_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py rename to python/paddle/fluid/tests/unittests/collective/test_gen_nccl_id_op.py diff --git a/python/paddle/fluid/tests/unittests/test_new_group_api.py b/python/paddle/fluid/tests/unittests/collective/test_new_group_api.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_new_group_api.py rename to python/paddle/fluid/tests/unittests/collective/test_new_group_api.py diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 01ff66c8386fd..1243fc0f63f97 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -1,7 +1,34 @@ -name,os,arch,timeout,run_type,launcher,dist_ut_port,run_serial,ENVS,conditions -test_allreduce,linux,gpu;rocm,120,DIST,test_runner.py,20071,1,PYTHONPATH=..;http_proxy=;https_proxy=, -test_broadcast,linux,gpu;rocm,120,DIST,test_runner.py,20073,1,PYTHONPATH=..;http_proxy=;https_proxy=, 
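The hunk above swaps the old dist_ut_port column, where every row pinned an explicit port (20071, 20073, ...), for num_port, which only states how many ports a test needs; the generator then assigns non-conflicting base ports itself, emitting them as PADDLE_DIST_UT_PORT (21282, 21284, 21286 in the generated multinode CMakeLists above). A toy allocator sketch, purely illustrative, since the real assignment policy lives in gen_ut_cmakelists.py:

    def assign_base_ports(tests, start=21282, stride=2):
        # hand out one base port per test, advancing by a fixed stride;
        # the real generator's spacing policy may differ
        env = {}
        port = start
        for name in tests:
            env[name] = "PADDLE_DIST_UT_PORT={};http_proxy=;https_proxy=".format(port)
            port += stride
        return env

    for name, envs in assign_base_ports(["test_multinode_dygraph_hybrid_dpppmp",
                                         "test_multinode_dygraph_hybrid_dp"]).items():
        print(name, "->", envs)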
-test_c_concat,linux,gpu;rocm,120,DIST,test_runner.py,20075,1,PYTHONPATH=..;http_proxy=;https_proxy=, -test_c_identity,linux,gpu;rocm,120,DIST,test_runner.py,20077,1,PYTHONPATH=..;http_proxy=;https_proxy=, -test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,20079,1,PYTHONPATH=..;http_proxy=;https_proxy=, -test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,20081,1,PYTHONPATH=..;http_proxy=;https_proxy=, +name,os,arch,timeout,run_type,launcher,num_port,run_serial,ENVS,conditions +test_allreduce,linux,gpu;rocm,120,DIST,test_runner.py,2,1,PYTHONPATH=..;http_proxy=;https_proxy=, +test_broadcast,linux,gpu;rocm,120,DIST,test_runner.py,2,1,PYTHONPATH=..;http_proxy=;https_proxy=, +test_c_concat,linux,gpu;rocm,120,DIST,test_runner.py,2,1,PYTHONPATH=..;http_proxy=;https_proxy=, +test_c_identity,linux,gpu;rocm,120,DIST,test_runner.py,2,1,PYTHONPATH=..;http_proxy=;https_proxy=, +test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,2,1,PYTHONPATH=..;http_proxy=;https_proxy=, +test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,2,1,PYTHONPATH=..;http_proxy=;https_proxy=, +test_collective_allgather_api,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allgather_object_api,linux,gpu;rocm,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allreduce_api,linux,gpu;rocm,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_api,linux,gpu;rocm,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_single,linux,gpu;rocm,350,DIST,../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_barrier_api,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_batch_isend_irecv,linux,gpu;rocm,350,DIST,../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_broadcast_api,linux,gpu;rocm,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_cpu_barrier_with_gloo,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_global_gather,linux,gpu;rocm,200,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_global_scatter,linux,gpu;rocm,200,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_optimizer,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_process_group,linux,gpu;rocm,350,DIST,../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_api,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_scatter,linux,gpu;rocm,350,DIST,../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., 
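The launcher field presumably decides which kind of CMake rule the generator emits: shell launchers such as dist_test.sh or multinode_dist_test.sh show up as bash_test_modules(... START_BASH ...) blocks, as in the generated multinode CMakeLists above, while test_runner.py rows would map to py_test_modules rules like those in the npu CMakeLists hunk below. A hypothetical dispatch sketch of that choice, not the generator's actual code:

    import os

    def cmake_rule_for(record):
        # shell launchers -> bash_test_modules; python launchers -> py_test_modules
        launcher = record["launcher"]
        if launcher.endswith(".sh"):
            return "bash_test_modules({} START_BASH {} ...)".format(
                record["name"], os.path.basename(launcher))
        return "py_test_modules({} MODULES {} ...)".format(
            record["name"], record["name"])

    print(cmake_rule_for({"name": "test_eager_dist_api", "launcher": "test_runner.py"}))
    # py_test_modules(test_eager_dist_api MODULES test_eager_dist_api ...)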
+test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., +test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,1,http_proxy=;https_proxy=;PYTHONPATH=.., diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 5ee2238d2d140..b505ac07850ee 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -30,7 +30,7 @@ from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.framework import _non_static_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops SEED = 2020 @@ -176,7 +176,7 @@ def weight(self, value): def forward(self, input, label, length=None): if _non_static_mode(): - _, _, _, log_likelihood = _C_ops.linear_chain_crf( + _, _, _, log_likelihood = _legacy_C_ops.linear_chain_crf( input, self._transition, label, length, "is_test", self._is_test) return log_likelihood @@ -234,8 +234,8 @@ def weight(self, value): def forward(self, input, label=None, length=None): if _non_static_mode(): - return _C_ops.crf_decoding(input, self._transition, label, length, - "is_test", self._is_test) + return _legacy_C_ops.crf_decoding(input, self._transition, label, + length, "is_test", self._is_test) viterbi_path = self._helper.create_variable_for_type_inference( dtype=self._dtype) @@ -268,11 +268,12 @@ def __init__(self, def forward(self, input, label, seq_length=None): if _non_static_mode(): - return _C_ops.chunk_eval(input, label, seq_length, - "num_chunk_types", self.num_chunk_types, - "chunk_scheme", self.chunk_scheme, - "excluded_chunk_types", - self.excluded_chunk_types or []) + return _legacy_C_ops.chunk_eval(input, label, seq_length, + "num_chunk_types", + self.num_chunk_types, + "chunk_scheme", self.chunk_scheme, + "excluded_chunk_types", + self.excluded_chunk_types or []) precision = self._helper.create_variable_for_type_inference( dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py index 066869750ed07..3d2523a721a13 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py +++ b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py @@ -26,7 +26,7 @@ import paddle.static as static from numpy.random import random as rand from paddle.fluid import Program, program_guard -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops sys.path.append("../") from op_test import OpTest @@ -75,15 +75,15 @@ def class_name(cls, num, params_dict): def fft_c2c_python_api(x, axes, norm, forward): - return _C_ops.final_state_fft_c2c(x, axes, norm, forward) + return _C_ops.fft_c2c(x, axes, norm, forward) def fft_r2c_python_api(x, axes, norm, forward, onesided): - return _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + return _C_ops.fft_r2c(x, axes, norm, 
forward, onesided) def fft_c2r_python_api(x, axes, norm, forward, last_dim_size=0): - return _C_ops.final_state_fft_c2r(x, axes, norm, forward, last_dim_size) + return _C_ops.fft_c2r(x, axes, norm, forward, last_dim_size) @parameterize( diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer_with_virtual_stage.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer_with_virtual_stage.py new file mode 100644 index 0000000000000..0ff14ad5f5452 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer_with_virtual_stage.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import os +import paddle +from paddle.distributed import fleet +import paddle.nn as nn +from paddle.fluid.dygraph.layers import Layer +from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer +import paddle.nn.functional as F + + +class ReshapeHelp(Layer): + + def __init__(self, shape): + super(ReshapeHelp, self).__init__() + self.shape = shape + + def forward(self, x): + return x.reshape(shape=self.shape) + + +class MLPForVirtualStageLayerTest(PipelineLayer): + + def __init__(self, num_classes=10, **kwargs): + self.num_classes = num_classes + decs = [ + LayerDesc(nn.Linear, 2, self.num_classes), + LayerDesc(nn.Linear, self.num_classes, 2), + LayerDesc(nn.Linear, 2, self.num_classes), + LayerDesc(nn.Linear, self.num_classes, 2), + LayerDesc(nn.Linear, 2, self.num_classes), + LayerDesc(nn.Linear, self.num_classes, 2), + LayerDesc(nn.Linear, 2, self.num_classes), + LayerDesc(nn.Linear, self.num_classes, 2), + ] + super(MLPForVirtualStageLayerTest, + self).__init__(layers=decs, + loss_fn=nn.CrossEntropyLoss(), + **kwargs) + + +class TestPipeLayerAPI(unittest.TestCase): + + def setUp(self): + strategy = fleet.DistributedStrategy() + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 1, + "pp_degree": self.pipeline_parallel_size + } + fleet.init(is_collective=True, strategy=strategy) + self.rank = fleet.worker_index() + self.hcg = fleet.get_hybrid_communicate_group() + + def test_pipelayer_desc(self): + pipe_model = MLPForVirtualStageLayerTest( + seg_method="layer:Linear", + num_stages=self.pipeline_parallel_size, + num_virtual_pipeline_stages=2, + recompute_interval=1) + assert len(pipe_model.parameters()) > 0 + model_chunks = pipe_model.get_model_chunks() + assert model_chunks is not None + assert len(model_chunks) == 2 + + optimizer = paddle.optimizer.SGD(parameters=pipe_model.parameters()) + + try: + model_chunks[0](paddle.to_tensor([1., 2.])) + except NotImplementedError: + pass + + # fake call for the forward function of virtual pipeline layer + for i in range(len(model_chunks)): + out = pipe_model(paddle.to_tensor([1., 2.]), chunk_id=i) + assert list(out.shape) == [2] + out = F.relu(out) + loss = paddle.mean(out) + loss.backward() + + optimizer.step() + + # just make sure the model can be wrapped 
with distributed model + dist_model = fleet.distributed_model(pipe_model) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py index 7dc668dfe56f6..e8418ce244146 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py @@ -22,7 +22,7 @@ sys.path.append('..') from op_test import OpTest import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops paddle.enable_static() @@ -57,8 +57,8 @@ def check_place(self, place): with paddle.fluid.dygraph.guard(place): x_np = np.random.rand(5, 11, 13).astype('float32') x = paddle.to_tensor(x_np) - y1 = _C_ops.squared_l2_norm(x) - y2 = _C_ops.squared_l2_norm(x) + y1 = _legacy_C_ops.squared_l2_norm(x) + y2 = _legacy_C_ops.squared_l2_norm(x) np.testing.assert_allclose(y1.numpy(), y2.numpy()) def test_main(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_truncated_gaussian_random_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_truncated_gaussian_random_op_mlu.py index e1752029ef97c..215430585da73 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_truncated_gaussian_random_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_truncated_gaussian_random_op_mlu.py @@ -80,7 +80,7 @@ def gaussian_random_test(self, place): def gaussian_random_test_eager(self, place): with fluid.dygraph.guard(place): with _test_eager_guard(): - out = paddle._C_ops.final_state_truncated_gaussian_random( + out = paddle._C_ops.truncated_gaussian_random( self.attrs["shape"], self.attrs["mean"], self.attrs["std"], self.attrs["seed"], core.VarDesc.VarType.FP32, place) self.assertAlmostEqual(numpy.mean(out.numpy()), .0, delta=0.1) diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 0c868e4568240..57e52206653c8 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -6,8 +6,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_ASCEND_CL) foreach(TEST_OP ${TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS - FLAGS_USE_STANDALONE_EXECUTOR=0) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() # NOTE: NPU `get_float_status` read the value from register, During the test, diff --git a/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py index 71764aad47c22..3db98ea8f3849 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py @@ -19,7 +19,7 @@ sys.path.append("..") from op_test import OpTest, skip_check_grad_ci import paddle -import paddle._C_ops as ops +import paddle._legacy_C_ops as ops class TestGetFloatStatusOp(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py index a0bb1ca8af386..c7439cee7c60a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py @@ -24,7 +24,7 @@ from op_test import OpTest import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle.fluid as fluid 
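Away from the test relocations, most of the Python churn in this diff is one mechanical migration: operators bound through the old attribute-style interface keep their argument lists but move from paddle._C_ops to paddle._legacy_C_ops, while the new final-state kernels drop their final_state_ prefix and take over the plain _C_ops names. A minimal before/after sketch in dynamic-graph mode, with both argument lists taken verbatim from hunks in this diff (these are internal, unstable APIs, shown only to illustrate the rename):

    import paddle
    from paddle import _C_ops, _legacy_C_ops

    x = paddle.uniform([40, 40], dtype="float32")

    # was _C_ops.squared_l2_norm(x): same signature, now in the legacy namespace
    y = _legacy_C_ops.squared_l2_norm(x)

    # was _C_ops.final_state_dropout(...): same arguments, prefix dropped
    out, mask = _C_ops.dropout(x, None, 0.5, False, "upscale_in_train", 0, False)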
from paddle import compat as cpt from paddle.fluid import core, framework, executor @@ -201,9 +201,9 @@ def calc_dygraph_output(self, place): inputs = self.prepare_dygraph_input(place) outputs = self.prepare_dygraph_output() - _C_ops.run_program(inputs['X'], inputs['Params'], outputs['Out'], - outputs['OutScope'], outputs['DOut'], None, - *self.attrs) + _legacy_C_ops.run_program(inputs['X'], inputs['Params'], + outputs['Out'], outputs['OutScope'], + outputs['DOut'], None, *self.attrs) return outputs['Out'] def calc_dygraph_grad(self, place): @@ -215,9 +215,9 @@ def calc_dygraph_grad(self, place): inputs, input_param_list = self.prepare_dygraph_input(place, True) outputs = self.prepare_dygraph_output() - _C_ops.run_program(inputs['X'], inputs['Params'], outputs['Out'], - outputs['OutScope'], outputs['DOut'], None, - *self.attrs) + _legacy_C_ops.run_program(inputs['X'], inputs['Params'], + outputs['Out'], outputs['OutScope'], + outputs['DOut'], None, *self.attrs) for param in input_param_list: var_type = self._get_grad_vartype(param.name) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 958b19772410f..b849d6e6f5973 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -3117,7 +3117,7 @@ def test_static_api(self): for r in res: np.testing.assert_allclose(out_ref, r, rtol=1e-05) - def test_dygraph_api(self): + def func_test_dygraph_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) out1 = F.swish(x) @@ -3128,9 +3128,10 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): - self.test_dygraph_api() + self.func_test_dygraph_api() + self.func_test_dygraph_api() def test_fluid_api(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py index 98560742135c3..b8b7786fa9531 100644 --- a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py @@ -48,11 +48,10 @@ def bicubic_interp_test(x, if not isinstance(SizeTensor, list) and not isinstance( SizeTensor, tuple): SizeTensor = [SizeTensor] - return paddle._C_ops.final_state_bicubic_interp(x, OutSize, SizeTensor, - Scale, data_layout, out_d, - out_h, out_w, scale, - interp_method, - align_corners, align_mode) + return paddle._C_ops.bicubic_interp(x, OutSize, SizeTensor, Scale, + data_layout, out_d, out_h, out_w, scale, + interp_method, align_corners, + align_mode) def cubic_1(x, a): diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py index 9ef3a508fc358..bc5fbb96899f4 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -46,11 +46,10 @@ def bilinear_interp_test(x, if not isinstance(SizeTensor, list) and not isinstance( SizeTensor, tuple): SizeTensor = [SizeTensor] - return paddle._C_ops.final_state_bilinear_interp(x, OutSize, SizeTensor, - Scale, data_layout, out_d, - out_h, out_w, scale, - interp_method, - align_corners, align_mode) + return paddle._C_ops.bilinear_interp(x, OutSize, SizeTensor, Scale, + data_layout, 
out_d, out_h, out_w, + scale, interp_method, align_corners, + align_mode) def bilinear_interp_np(input, diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 0c526182ab78f..d36ff2bb430a3 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1470,7 +1470,7 @@ def test_cross_entropy_loss_2d_sum(self): np.testing.assert_allclose(static_ret[0], expected, rtol=1e-05) np.testing.assert_allclose(dy_ret_value, expected, rtol=1e-05) - def test_soft_1d_dygraph_final_state_api(self): + def test_soft_1d_dygraph_api(self): with _test_eager_guard(): self.test_cross_entropy_loss_soft_1d() self.test_cross_entropy_loss_soft_1d_weight() @@ -1478,12 +1478,12 @@ def test_soft_1d_dygraph_final_state_api(self): self.test_cross_entropy_loss_soft_1d_weight_mean() # put all testcases in one test will be failed - def test_soft_2d_dygraph_final_state_api(self): + def test_soft_2d_dygraph_api(self): with _test_eager_guard(): self.test_cross_entropy_loss_soft_2d() self.test_cross_entropy_loss_soft_2d_weight_mean() - def test_other_dygraph_final_state_api(self): + def test_other_dygraph_api(self): with _test_eager_guard(): self.test_cross_entropy_loss_1d_with_mean_ignore() self.test_cross_entropy_loss_1d_with_mean_ignore_negative() diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py index dc9991c3836f7..202fa349090ff 100755 --- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -23,7 +23,7 @@ import sys import warnings import paddle.utils.deprecated as deprecated -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops LOWEST_WARNING_POSTION = 3 ERROR_WARNING_POSTION = sys.maxsize @@ -141,7 +141,7 @@ def test_ops_elementwise_mul(self): b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) x = paddle.to_tensor(a) y = paddle.to_tensor(b) - res = _C_ops.elementwise_mul(x, y) + res = _legacy_C_ops.elementwise_mul(x, y) # expected expected = LOWEST_WARNING_POSTION diff --git a/python/paddle/fluid/tests/unittests/test_dropout_nd_op.py b/python/paddle/fluid/tests/unittests/test_dropout_nd_op.py index 9b8520dfdbc3e..cb92168ffc76f 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_nd_op.py @@ -22,7 +22,7 @@ import paddle.fluid.core as core from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _non_static_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.static import default_main_program @@ -39,12 +39,12 @@ def dropout_nd(x, if default_main_program().random_seed != 0: seed = default_main_program().random_seed - out, mask = _C_ops.dropout_nd(x, 'dropout_prob', p, 'is_test', - not training, 'fix_seed', seed - is not None, 'seed', - seed if seed is not None else 0, - 'dropout_implementation', mode, 'axis', - drop_axes) + out, mask = _legacy_C_ops.dropout_nd(x, 'dropout_prob', p, 'is_test', + not training, 'fix_seed', seed + is not None, 'seed', + seed if seed is not None else 0, + 'dropout_implementation', mode, + 'axis', drop_axes) return out helper = LayerHelper('dropout_nd', **locals()) diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py 
b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 3128fd4d0406a..eb696420fe03b 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -25,7 +25,7 @@ from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph import os -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops class TestDropoutOp(OpTest): @@ -1031,8 +1031,8 @@ def test_backward_downscale_in_infer_eager(self): with _test_eager_guard(): input = paddle.uniform([40, 40], dtype="float32") input.stop_gradient = False - out, mask = _C_ops.final_state_dropout( - input, None, 0.5, False, "downgrade_in_infer", 0, False) + out, mask = _C_ops.dropout(input, None, 0.5, False, + "downgrade_in_infer", 0, False) out.backward() np.testing.assert_array_equal( input.gradient(), @@ -1063,8 +1063,8 @@ def test_backward_upscale_train_eager(self): prob = 0.5 input = paddle.uniform([40, 40], dtype="float32") input.stop_gradient = False - out, mask = _C_ops.final_state_dropout( - input, None, 0.5, False, "upscale_in_train", 0, False) + out, mask = _C_ops.dropout(input, None, 0.5, False, + "upscale_in_train", 0, False) out.backward() np.testing.assert_allclose(input.gradient(), @@ -1098,8 +1098,8 @@ def test_backward_upscale_train_2_eager(self): prob = 0.3 input = paddle.uniform([40, 40], dtype="float32") input.stop_gradient = False - out, mask = _C_ops.final_state_dropout( - input, None, 0.3, False, "upscale_in_train", 0, False) + out, mask = _C_ops.dropout(input, None, 0.3, False, + "upscale_in_train", 0, False) out.backward() diff --git a/python/paddle/fluid/tests/unittests/test_eager_run_program.py b/python/paddle/fluid/tests/unittests/test_eager_run_program.py index a04c544e90257..ba4c9a9452c9b 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_run_program.py +++ b/python/paddle/fluid/tests/unittests/test_eager_run_program.py @@ -14,7 +14,7 @@ import paddle import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _test_eager_guard, Variable, _in_legacy_dygraph from paddle.fluid import core from paddle.fluid.layers.utils import _hash_with_id @@ -102,8 +102,8 @@ def test_eager(self): 'end_op_index', main_program.desc.block(0).op_size(), 'is_test', False, 'program_id', _hash_with_id(program)) - _C_ops.run_program([x_t, y_t], [fake_var], [out_t], [scope], - [fake_var], None, *attrs) + _legacy_C_ops.run_program([x_t, y_t], [fake_var], [out_t], [scope], + [fake_var], None, *attrs) loss = paddle.mean(out_t) loss.backward() diff --git a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py index 1266e1c9a6a6e..910edf7a25985 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py @@ -19,7 +19,7 @@ from op_test import OpTest import paddle import paddle.fluid.core as core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle.fluid as fluid from paddle.fluid import Program, program_guard from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py index d14a967b87925..e3e9db14db93a 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -24,7 +24,7 @@ from 
paddle.dataset.common import DATA_HOME from paddle.fluid.framework import core, _non_static_mode, _test_eager_guard from paddle.fluid.layer_helper import LayerHelper -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import sys import tempfile @@ -79,7 +79,7 @@ def forward(self, is_split_into_words=False, pad_to_max_seq_len=False): if _non_static_mode(): - input_ids, seg_ids = _C_ops.faster_tokenizer( + input_ids, seg_ids = _legacy_C_ops.faster_tokenizer( self.vocab, text, text_pair, "do_lower_case", do_lower_case, "max_seq_len", max_seq_len, "pad_to_max_seq_len", pad_to_max_seq_len, "is_split_into_words", is_split_into_words) diff --git a/python/paddle/fluid/tests/unittests/test_function_hook.py b/python/paddle/fluid/tests/unittests/test_function_hook.py index 8c88ee06c1efd..64e099460eb32 100644 --- a/python/paddle/fluid/tests/unittests/test_function_hook.py +++ b/python/paddle/fluid/tests/unittests/test_function_hook.py @@ -19,7 +19,7 @@ import numpy as np import paddle.fluid.core as core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _test_eager_guard @@ -50,8 +50,8 @@ def func_hook(self): x = paddle.to_tensor(input_data.astype(np.float32), stop_gradient=False) z = paddle.to_tensor(input_data.astype(np.float32), stop_gradient=False) - y = _C_ops.sigmoid(x) - out = _C_ops.matmul_v2(y, z, 'trans_x', False, 'trans_y', False) + y = _legacy_C_ops.sigmoid(x) + out = _legacy_C_ops.matmul_v2(y, z, 'trans_x', False, 'trans_y', False) out._register_void_function_post_hook(test_hook) y._register_void_function_post_hook(test_hook) diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index 8b8d378e5c8fa..f911d614ee49b 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -25,7 +25,7 @@ import unittest from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float from test_sparse_attention_op import get_cuda_version -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import default_main_program from paddle.fluid import core @@ -232,7 +232,7 @@ def get_fused_gate_attention_out(self): output_w = paddle.to_tensor(self.output_w, stop_gradient=False) output_b = paddle.to_tensor(self.output_b, stop_gradient=False) - _, _, _, _, softmax_out, fmha_out, gate_out, out = _C_ops.fused_gate_attention( + _, _, _, _, softmax_out, fmha_out, gate_out, out = _legacy_C_ops.fused_gate_attention( query, key, q_weight, k_weight, v_weight, qkv_weight, nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b, 'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index 65276e9c92e96..47851c895fa63 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -32,7 +32,7 @@ from paddle.nn.initializer import Constant from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle.fluid.framework import _non_static_mode, default_main_program -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.incubate.nn.functional import fused_multi_transformer 
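The split driving all of these import changes: _legacy_C_ops keeps the old tracer-based kernels, whose attributes travel as alternating 'name', value pairs, while _C_ops now holds only the final-state eager kernels, which drop the final_state_ prefix and take their attributes positionally. A minimal sketch of the two calling styles, lifted from the p_norm dispatch that appears verbatim in the test_norm_all.py hunk further down (the defaults shown for p, axis, and epsilon are illustrative):

from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph


def p_norm_python_api(x, p=2.0, axis=None, epsilon=1e-12, keepdim=False,
                      as_vector=False):
    if in_dygraph_mode():
        # final-state kernel: attributes are plain positional arguments
        return _C_ops.p_norm(x, p, axis, epsilon, keepdim, as_vector)
    if _in_legacy_dygraph():
        # legacy kernel: attributes as alternating name/value pairs
        return _legacy_C_ops.p_norm(x, 'axis', axis, 'porder', float(p),
                                    'keepdim', keepdim, 'epsilon', epsilon,
                                    'as_vector', as_vector)
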
default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py index 0a810735b31fc..6a3b94825e509 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py @@ -303,7 +303,7 @@ def attr_data_format(): class TestGroupNormEager(unittest.TestCase): - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): self.dtype = np.float64 self.shape = (8, 32, 32) input = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py b/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py index 97101c619ce6d..ace46712d734d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid import numpy as np import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _test_eager_guard @@ -44,7 +44,7 @@ def paddle_imperative_ParameterList(self, num_stacked_param): def forward(self, x): for i, p in enumerate(self.params): - x = _C_ops.mul(x, p) + x = _legacy_C_ops.mul(x, p) return x diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index b8ea449c2b254..5b08c6167925e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid import numpy as np import unittest -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, _in_eager_without_dygraph_check if fluid.is_compiled_with_cuda(): @@ -117,8 +117,9 @@ def __init__(self, num_channels, epsilon=1e-5): def forward(self, input): if fluid._non_static_mode(): - out, _, _ = _C_ops.instance_norm(input, self.scale, self.bias, - 'epsilon', self.epsilon) + out, _, _ = _legacy_C_ops.instance_norm(input, self.scale, + self.bias, 'epsilon', + self.epsilon) return out else: return fluid.layers.instance_norm( diff --git a/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py b/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py index 7ec04ed90b0ae..58a35a71199e8 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py @@ -15,7 +15,7 @@ import numpy as np import paddle import paddle.fluid as fluid -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import unittest paddle.disable_static() @@ -26,7 +26,7 @@ def clear_grad(w, a): @paddle.no_grad() def warp(*_): assert w.grad is not None - _C_ops.scale_(w.grad, 'scale', 0.5) + _legacy_C_ops.scale_(w.grad, 'scale', 0.5) w.clear_gradient(False) return warp @@ -44,7 +44,7 @@ def test(self): w._register_backward_hook(_clear_grad) for i in range(10): - out = _C_ops.scale(w, 'scale', 0.1) + out = _legacy_C_ops.scale(w, 'scale', 0.1) out.backward() diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py 
b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py index ed6b833956b1a..18861d125ed65 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py @@ -47,11 +47,9 @@ def linear_interp_test(x, if not isinstance(SizeTensor, list) and not isinstance( SizeTensor, tuple): SizeTensor = [SizeTensor] - return paddle._C_ops.final_state_linear_interp(x, OutSize, SizeTensor, - Scale, data_layout, out_d, - out_h, out_w, scale, - interp_method, align_corners, - align_mode) + return paddle._C_ops.linear_interp(x, OutSize, SizeTensor, Scale, + data_layout, out_d, out_h, out_w, scale, + interp_method, align_corners, align_mode) def linear_interp_np(input, diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index 5e28d52f93f3d..7d3702389a80c 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -109,7 +109,7 @@ def test_static_api(self): for r in res: np.testing.assert_allclose(out_ref, r, rtol=1e-05) - def test_dygraph_api(self): + def func_test_dygraph_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) out1 = F.maxout(x, self.groups, self.axis) @@ -152,9 +152,10 @@ def test_errors(self): x_float32 = paddle.fluid.data(name='x_float32', shape=[2, 4, 6, 8]) self.assertRaises(ValueError, F.maxout, x_float32, 2, 2) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): - self.test_dygraph_api() + self.func_test_dygraph_api() + self.func_test_dygraph_api() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py index 84f3cc7efb680..21749a92f31be 100644 --- a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py @@ -15,7 +15,7 @@ import unittest import paddle import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def run_adam_op(params, @@ -55,7 +55,7 @@ def run_adam_op(params, if not use_merged: for i in range(len(param_vars)): - _, _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _, _ = _legacy_C_ops.adam( param_vars[i], grad_vars[i], lr_vars[i], moment1_vars[i], moment2_vars[i], beta1_pow_vars[i], beta2_pow_vars[i], master_param_vars[i], param_vars[i], moment1_vars[i], @@ -63,7 +63,7 @@ def run_adam_op(params, master_param_vars[i], 'epsilon', epsilon, 'beta1', beta1, 'beta2', beta2, 'multi_precision', multi_precision) else: - _, _, _, _, _, _ = _C_ops.merged_adam( + _, _, _, _, _, _ = _legacy_C_ops.merged_adam( param_vars, grad_vars, lr_vars, moment1_vars, moment2_vars, beta1_pow_vars, beta2_pow_vars, master_param_vars, param_vars, moment1_vars, moment2_vars, beta1_pow_vars, beta2_pow_vars, diff --git a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py index 8e76859c880d4..9a85d9cb33bd3 100644 --- a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py +++ b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py @@ -231,7 +231,7 @@ def test_api_eager_dygraph(self): class TestMeshgridEager(unittest.TestCase): - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): input_1 = np.random.randint(0, 100, [ 100, ]).astype('int32') diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py 
b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 4265a1c8d5782..8e58aeb2c1e92 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -282,7 +282,7 @@ def test_dygraph_without_out(self): expected_result = np.linalg.multi_dot([input_array1, input_array2]) np.testing.assert_allclose(expected_result, out.numpy(), rtol=1e-05) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_dygraph_without_out() diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index e076cc9277846..9aaf62028766b 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import Program, program_guard, in_dygraph_mode, _non_static_mode from paddle.fluid.layer_helper import LayerHelper -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def multiclass_nms3(bboxes, @@ -43,7 +43,7 @@ def multiclass_nms3(bboxes, if in_dygraph_mode(): attrs = (score_threshold, nms_top_k, keep_top_k, nms_threshold, normalized, nms_eta, background_label) - output, index, nms_rois_num = _C_ops.final_state_multiclass_nms3( + output, index, nms_rois_num = _C_ops.multiclass_nms3( bboxes, scores, rois_num, *attrs) if not return_index: index = None @@ -53,7 +53,7 @@ def multiclass_nms3(bboxes, score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, 'normalized', normalized) - output, index, nms_rois_num = _C_ops.multiclass_nms3( + output, index, nms_rois_num = _legacy_C_ops.multiclass_nms3( bboxes, scores, rois_num, *attrs) if not return_index: index = None diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py index 29a11ab68d0d0..05d41c04e6d98 100644 --- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py @@ -107,7 +107,7 @@ def test_multiplex_dygraph(self): res = paddle.multiplex(inputs, index) paddle.enable_static() - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with fluid.dygraph.guard(): img1 = np.array([[1, 2], [3, 4]]).astype(np.float32) img2 = np.array([[5, 6], [7, 8]]).astype(np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py index ae58a7fe900e8..40699c99eb810 100755 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py @@ -49,11 +49,10 @@ def nearest_interp_test(x, if not isinstance(SizeTensor, list) and not isinstance( SizeTensor, tuple): SizeTensor = [SizeTensor] - return paddle._C_ops.final_state_nearest_interp(x, OutSize, SizeTensor, - Scale, data_layout, out_d, - out_h, out_w, scale, - interp_method, - align_corners, align_mode) + return paddle._C_ops.nearest_interp(x, OutSize, SizeTensor, Scale, + data_layout, out_d, out_h, out_w, scale, + interp_method, align_corners, + align_mode) def nearest_neighbor_interp_np(X, diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index c65bff3a7bb39..8557219e52646 100644 --- 
a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -20,7 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph @@ -32,12 +32,11 @@ def p_norm_python_api(x, keepdim=False, as_vector=False): if in_dygraph_mode(): - return _C_ops.final_state_p_norm(x, p, axis, epsilon, keepdim, - as_vector) + return _C_ops.p_norm(x, p, axis, epsilon, keepdim, as_vector) if _in_legacy_dygraph(): - return _C_ops.p_norm(x, 'axis', axis, 'porder', float(p), 'keepdim', - keepdim, 'epsilon', epsilon, 'as_vector', - as_vector) + return _legacy_C_ops.p_norm(x, 'axis', axis, 'porder', float(p), + 'keepdim', keepdim, 'epsilon', epsilon, + 'as_vector', as_vector) def p_norm(x, axis, porder, keepdims=False, reduce_all=False): @@ -78,7 +77,7 @@ def p_norm(x, axis, porder, keepdims=False, reduce_all=False): return r -def frobenius_norm(x, axis=None, keepdims=False): +def numpy_frobenius_norm(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) if axis is None: x = x.reshape(1, x.size) r = np.linalg.norm(x, ord='fro', axis=axis, @@ -86,18 +85,18 @@ def frobenius_norm(x, axis=None, keepdims=False): return r -def final_state_frobenius_norm(x, dim, keep_dim, reduce_all): +def frobenius_norm(x, dim, keep_dim, reduce_all): return paddle.linalg.norm(x, p='fro', axis=dim, keepdim=keep_dim) class TestFrobeniusNormOp(OpTest): def setUp(self): - self.python_api = final_state_frobenius_norm + self.python_api = frobenius_norm self.op_type = "frobenius_norm" self.init_test_case() x = (np.random.random(self.shape) + 1.0).astype(self.dtype) - norm = frobenius_norm(x, self.axis, self.keepdim) + norm = numpy_frobenius_norm(x, self.axis, self.keepdim) self.reduce_all = (len(self.axis) == len(self.shape)) self.inputs = {'X': x} self.attrs = { @@ -414,7 +413,9 @@ def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): place = fluid.CPUPlace() exe = fluid.Executor(place) np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) - expected_result = frobenius_norm(np_input, axis=axis, keepdims=keep_dim) + expected_result = numpy_frobenius_norm(np_input, + axis=axis, + keepdims=keep_dim) result, = exe.run(feed={"X": np_input}, fetch_list=[out]) self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) if keep_dim and check_dim: diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py index 4c1394caaee04..fa5abbef8b87a 100644 --- a/python/paddle/fluid/tests/unittests/test_ones_like.py +++ b/python/paddle/fluid/tests/unittests/test_ones_like.py @@ -18,7 +18,7 @@ import numpy as np import paddle import paddle.fluid as fluid -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import ones_like from paddle.fluid import core, Program, program_guard from paddle.fluid.framework import convert_np_dtype_to_dtype_ @@ -90,9 +90,7 @@ def test_api(self): paddle.disable_static(place) for dtype in [np.float32, np.float64, np.int32, np.int64]: - out = _C_ops.final_state_ones(shape, - convert_np_dtype_to_dtype_(dtype), - place) + out = _C_ops.ones(shape, convert_np_dtype_to_dtype_(dtype), place) self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_op_function_generator.py 
b/python/paddle/fluid/tests/unittests/test_op_function_generator.py index 82fe1e1781d7e..65e4856b39aa8 100644 --- a/python/paddle/fluid/tests/unittests/test_op_function_generator.py +++ b/python/paddle/fluid/tests/unittests/test_op_function_generator.py @@ -21,7 +21,7 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.jit import TracedLayer import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops class TestTracedLayer(fluid.dygraph.Layer): @@ -30,7 +30,7 @@ def __init__(self, name_scope): super(TestTracedLayer, self).__init__(name_scope) def forward(self, input): - return _C_ops.relu(input) + return _legacy_C_ops.relu(input) class TestVariable(unittest.TestCase): @@ -49,7 +49,7 @@ def test_elementwise_add(self): x.stop_gradient = False res1 = layers.elementwise_add(x, y) - res2 = _C_ops.elementwise_add(x, y) + res2 = _legacy_C_ops.elementwise_add(x, y) np.testing.assert_array_equal(res1.numpy(), res2.numpy()) @@ -61,7 +61,7 @@ def test_elementwise_mul(self): y = fluid.dygraph.to_variable(b) res1 = layers.elementwise_mul(x, y) - res2 = _C_ops.elementwise_mul(x, y) + res2 = _legacy_C_ops.elementwise_mul(x, y) np.testing.assert_array_equal(res1.numpy(), res2.numpy()) @@ -71,7 +71,7 @@ def test_relu(self): x = fluid.dygraph.to_variable(a) res1 = layers.relu(x) - res2 = _C_ops.relu(x) + res2 = _legacy_C_ops.relu(x) np.testing.assert_array_equal(res1.numpy(), res2.numpy()) @@ -85,7 +85,7 @@ def test_trace_backward(self): x.stop_gradient = False y.stop_gradient = False - loss = _C_ops.elementwise_mul(x, y) + loss = _legacy_C_ops.elementwise_mul(x, y) loss.backward() x_grad = x.gradient() diff --git a/python/paddle/fluid/tests/unittests/test_operator.py b/python/paddle/fluid/tests/unittests/test_operator.py index a3ca52f78c97c..923a7f21e2ae9 100644 --- a/python/paddle/fluid/tests/unittests/test_operator.py +++ b/python/paddle/fluid/tests/unittests/test_operator.py @@ -16,6 +16,8 @@ import unittest +import numpy as np + import paddle.fluid.op as op import paddle.fluid.proto.framework_pb2 as framework_pb2 @@ -152,6 +154,7 @@ def __add_attr__(name, type): __add_attr__("int_attr", framework_pb2.INT) __add_attr__("float_attr", framework_pb2.FLOAT) + __add_attr__("float64_attr", framework_pb2.FLOAT64) __add_attr__("string_attr", framework_pb2.STRING) __add_attr__("ints_attr", framework_pb2.INTS) __add_attr__("floats_attr", framework_pb2.FLOATS) @@ -165,6 +168,7 @@ def __add_attr__(name, type): generated = method(X="a", int_attr=10, float_attr=3.2, + float64_attr=np.finfo("float64").max, string_attr="test_str", ints_attr=[0, 1, 2, 3, 4], floats_attr=[0.2, 3.2, 4.5], @@ -187,6 +191,11 @@ def __add_attr__(name, type): attr.type = framework_pb2.FLOAT attr.f = 3.2 + attr = expected.attrs.add() + attr.name = "float64_attr" + attr.type = framework_pb2.FLOAT64 + attr.float64 = np.finfo("float64").max + attr = expected.attrs.add() attr.name = "string_attr" attr.type = framework_pb2.STRING diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel_with_virtual_stage.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel_with_virtual_stage.py new file mode 100644 index 0000000000000..7011b4507e9b2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel_with_virtual_stage.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +import os + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestHybridPipeParallelWithVirtualStage(TestMultipleGpus): + + def test_hybrid_parallel_pp_layer_with_virtual_stage(self): + self.run_mnist_2gpu('hybrid_parallel_pp_layer_with_virtual_stage.py') + self.run_mnist_2gpu('hybrid_parallel_pp_layer_with_virtual_stage.py', + eager_mode=False) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 665cdfbd31c77..f3a5f96d62e7c 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -277,7 +277,7 @@ def test_pool1d(self): self.check_avg_dygraph_padding_same(place) self.check_max_dygraph_return_index_results(place) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_pool1d() @@ -409,7 +409,7 @@ def run_stride_out_of_range(): self.assertRaises(ValueError, run_stride_out_of_range) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_error_api() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 4a4e03ed99a89..5434f53070f27 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -343,7 +343,7 @@ def test_pool2d(self): self.check_max_dygraph_ceilmode_results(place) self.check_max_dygraph_nhwc_results(place) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_pool2d() @@ -539,7 +539,7 @@ def run_stride_out_of_range(): self.assertRaises(ValueError, run_stride_out_of_range) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_error_api() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index cd874dfa13159..60199401041f8 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -334,7 +334,7 @@ def test_pool3d(self): self.check_max_dygraph_ndhwc_results(place) self.check_max_dygraph_ceilmode_results(place) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_pool3d() @@ -504,7 +504,7 @@ def run_size_out_of_range(): self.assertRaises(ValueError, run_size_out_of_range) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_error_api() diff --git a/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py 
b/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py index 839b0e331a89b..77a2508ae995a 100644 --- a/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py +++ b/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py @@ -14,7 +14,7 @@ import numpy as np import paddle import paddle.fluid as fluid -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import framework from paddle.fluid.framework import _test_eager_guard import unittest @@ -28,7 +28,7 @@ def clear_grad_test_0(w, a): @paddle.no_grad() def warp(*_): assert w.grad is not None - _C_ops.scale_(w.grad, 'scale', 0.5) + _legacy_C_ops.scale_(w.grad, 'scale', 0.5) w._reset_grad_inplace_version(True) return warp @@ -44,8 +44,9 @@ def func_test(self): w._register_backward_hook(_clear_grad) for i in range(2): print(" Step: ", i) - out0 = _C_ops.scale(w, 'scale', 0.1) - out = _C_ops.matmul_v2(out0, w, 'trans_x', False, 'trans_y', False) + out0 = _legacy_C_ops.scale(w, 'scale', 0.1) + out = _legacy_C_ops.matmul_v2(out0, w, 'trans_x', False, 'trans_y', + False) out.backward() assert w.grad[0] == 0.15 @@ -88,8 +89,9 @@ def func_test(self): w._register_backward_hook(_clear_grad) for c.step in range(5): - out0 = _C_ops.scale(w, 'scale', 0.1) - out = _C_ops.matmul_v2(out0, w, 'trans_x', False, 'trans_y', False) + out0 = _legacy_C_ops.scale(w, 'scale', 0.1) + out = _legacy_C_ops.matmul_v2(out0, w, 'trans_x', False, 'trans_y', + False) out.backward() @@ -110,7 +112,7 @@ class TestInplaceClearGradAccumulationAlt(unittest.TestCase): def func_test(self): input_data = np.ones([1, 1]) w = paddle.to_tensor(input_data, 'float32', stop_gradient=False) - out = _C_ops.scale(w, 'scale', 0.1) + out = _legacy_C_ops.scale(w, 'scale', 0.1) out.backward() w.grad.scale_(scale=0.5) diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py index 60f2d0cb1ae71..e2260082fc968 100644 --- a/python/paddle/fluid/tests/unittests/test_reverse_op.py +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -268,7 +268,7 @@ def test_api(self): x = paddle.randn([4, 10]) y = paddle.randn([4, 10]) - out = paddle._C_ops.final_state_reverse_array([x, y], [0]) + out = paddle._C_ops.reverse_array([x, y], [0]) np.testing.assert_allclose(x.numpy(), out[1].numpy()) np.testing.assert_allclose(y.numpy(), out[0].numpy()) diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index e1b4549cf0798..50fd3c01769f4 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -20,7 +20,7 @@ import six import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle.fluid as fluid from paddle import compat as cpt from paddle.fluid import core, framework, executor @@ -200,9 +200,9 @@ def calc_dygraph_output(self, place): inputs = self.prepare_dygraph_input(place) outputs = self.prepare_dygraph_output() - _C_ops.run_program(inputs['X'], inputs['Params'], outputs['Out'], - outputs['OutScope'], outputs['DOut'], None, - *self.attrs) + _legacy_C_ops.run_program(inputs['X'], inputs['Params'], + outputs['Out'], outputs['OutScope'], + outputs['DOut'], None, *self.attrs) return outputs['Out'] def calc_dygraph_grad(self, place): @@ -214,9 +214,9 @@ def calc_dygraph_grad(self, place): inputs, input_param_list = self.prepare_dygraph_input(place, True) outputs = 
self.prepare_dygraph_output() - _C_ops.run_program(inputs['X'], inputs['Params'], outputs['Out'], - outputs['OutScope'], outputs['DOut'], None, - *self.attrs) + _legacy_C_ops.run_program(inputs['X'], inputs['Params'], + outputs['Out'], outputs['OutScope'], + outputs['DOut'], None, *self.attrs) for param in input_param_list: var_type = self._get_grad_vartype(param.name) diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index d4cb658d96aa0..2f0938d2b6c00 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -71,9 +71,10 @@ def python_core_api_without_softmax(logits, # the API paddle.nn.functional.softmax_with_cross_entropy cannot # set use_softmax=False, so add a core api manually assert use_softmax is False - _, loss = paddle._C_ops.final_state_cross_entropy_with_softmax( - logits, label, soft_label, use_softmax, numeric_stable_mode, - ignore_index, axis) + _, loss = paddle._C_ops.cross_entropy_with_softmax(logits, label, + soft_label, use_softmax, + numeric_stable_mode, + ignore_index, axis) return loss diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index 36ecfeccd1a1d..4477998875246 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py b/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py index f8bc93f27032b..84ff272659348 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py index 5f6d71008d785..0e6d8ddef9444 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.fluid.core as core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _test_eager_guard import copy diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py index 8c4131d71d081..d76f4712dbc1c 100644 --- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py @@ -19,15 +19,15 @@ from numpy import linalg as LA from op_test import OpTest import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.framework import in_dygraph_mode def test_squared_l2_norm(x): if in_dygraph_mode(): - return _C_ops.final_state_squared_l2_norm(x) - else: return _C_ops.squared_l2_norm(x) + else: + return _legacy_C_ops.squared_l2_norm(x) class 
TestL2LossOp(OpTest): @@ -60,8 +60,8 @@ def check_place(self, place): with paddle.fluid.dygraph.guard(place): x_np = np.random.rand(5, 11, 13).astype('float32') x = paddle.to_tensor(x_np) - y1 = _C_ops.squared_l2_norm(x) - y2 = _C_ops.squared_l2_norm(x) + y1 = _legacy_C_ops.squared_l2_norm(x) + y2 = _legacy_C_ops.squared_l2_norm(x) np.testing.assert_array_equal(y1.numpy(), y2.numpy()) def test_main(self): diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index a431f79af61ca..e327c335b0dd0 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -25,7 +25,7 @@ from paddle.fluid.tests.unittests.op_test import (OpTest, convert_float_to_uint16, convert_uint16_to_float) -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _test_eager_guard @@ -358,7 +358,7 @@ def test_api(self): self.assertEqual((sum_value.numpy() == expected_result).all(), True) - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with fluid.dygraph.guard(): with _test_eager_guard(): input0 = paddle.ones(shape=[2, 3], dtype='float32') @@ -470,11 +470,11 @@ def test_errors(self): def test_empty_list_input(): with fluid.dygraph.guard(): - fluid._C_ops.sum([]) + fluid._legacy_C_ops.sum([]) def test_list_of_none_input(): with fluid.dygraph.guard(): - fluid._C_ops.sum([None]) + fluid._legacy_C_ops.sum([None]) self.assertRaises(Exception, test_empty_list_input) self.assertRaises(Exception, test_list_of_none_input) diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py index 6aace168ec526..4255a718d42e1 100755 --- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py @@ -49,11 +49,10 @@ def trilinear_interp_test(x, if not isinstance(SizeTensor, list) and not isinstance( SizeTensor, tuple): SizeTensor = [SizeTensor] - return paddle._C_ops.final_state_trilinear_interp(x, OutSize, SizeTensor, - Scale, data_layout, out_d, - out_h, out_w, scale, - interp_method, - align_corners, align_mode) + return paddle._C_ops.trilinear_interp(x, OutSize, SizeTensor, Scale, + data_layout, out_d, out_h, out_w, + scale, interp_method, align_corners, + align_mode) def trilinear_interp_np(input, diff --git a/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py index 8016499d9ac73..69cb49b537fdc 100644 --- a/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py @@ -75,7 +75,7 @@ def gaussian_random_test(self, place): def gaussian_random_test_eager(self, place): with fluid.dygraph.guard(place): with _test_eager_guard(): - out = paddle._C_ops.final_state_truncated_gaussian_random( + out = paddle._C_ops.truncated_gaussian_random( self.attrs["shape"], self.attrs["mean"], self.attrs["std"], self.attrs["seed"], core.VarDesc.VarType.FP32, place) self.assertAlmostEqual(numpy.mean(out.numpy()), .0, delta=0.1) diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 7aba65f9d1120..21248a1577167 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -178,7 +178,7 @@ def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) np.testing.assert_allclose(hist, prob, rtol=0, atol=0.01) - def test_check_api(self): + def func_test_check_api(self): places = self._get_places() for place in places: with fluid.dygraph.base.guard(place=place): @@ -188,7 +188,8 @@ def test_check_api(self): def test_check_api_eager(self): with _test_eager_guard(): - self.test_check_api() + self.func_test_check_api() + self.func_test_check_api() class TestUniformRandomOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py index 789d52505aed2..a914864fd54f7 100644 --- a/python/paddle/fluid/tests/unittests/test_unique.py +++ b/python/paddle/fluid/tests/unittests/test_unique.py @@ -257,7 +257,7 @@ def test_dygraph_attr_dtype(self): self.assertTrue((counts.numpy() == np_counts).all(), True) paddle.enable_static() - def test_dygraph_final_state_api(self): + def test_dygraph_api(self): with _test_eager_guard(): self.test_dygraph_api_out() self.test_dygraph_api_attr() diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py index fcd6bba051eac..d7394172dcc2c 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py @@ -18,7 +18,7 @@ import paddle import paddle.fluid as fluid from paddle import zeros_like -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core, Program, program_guard from paddle.fluid.framework import _test_eager_guard from paddle.fluid.framework import convert_np_dtype_to_dtype_ @@ -97,9 +97,7 @@ def test_api(self): paddle.disable_static(place) for dtype in [np.float32, np.float64, np.int32, np.int64]: - out = _C_ops.final_state_zeros(shape, - convert_np_dtype_to_dtype_(dtype), - place) + out = _C_ops.zeros(shape, convert_np_dtype_to_dtype_(dtype), place) self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), True) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py index e6997367b2eb2..714b13391c345 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py @@ -26,7 +26,7 @@ from paddle.fluid.tests.unittests.op_test import (OpTest, convert_float_to_uint16, convert_uint16_to_float) -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import op_test from op_test_xpu import XPUOpTest from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper @@ -205,11 +205,11 @@ def test_errors(self): def test_empty_list_input(): with fluid.dygraph.guard(): - fluid._C_ops.sum([]) + fluid._legacy_C_ops.sum([]) def test_list_of_none_input(): with fluid.dygraph.guard(): - fluid._C_ops.sum([None]) + fluid._legacy_C_ops.sum([None]) self.assertRaises(Exception, test_empty_list_input) self.assertRaises(Exception, test_list_of_none_input) diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 9038e10658177..35a35b894c368 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -500,9 +500,8 @@ def _getitem_impl_(var, item): end = inputs['EndsTensorList'] else: end = attrs['ends'] - out = 
paddle._C_ops.final_state_slice(var, axes, st, end, - attrs['infer_flags'], - attrs['decrease_axis']) + out = paddle._C_ops.slice(var, axes, st, end, attrs['infer_flags'], + attrs['decrease_axis']) else: target_block = default_main_program().current_block() diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index de8fd3b005e29..db2e9e7b21402 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -17,7 +17,7 @@ from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode from paddle.fluid.framework import Variable from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from .utils import convert_out_size_to_list, get_out_size_tensor_inputs, reshape_lhs_rhs @@ -120,15 +120,15 @@ def send_u_recv(x, if _in_legacy_dygraph(): out_size = convert_out_size_to_list(out_size) - out, tmp = _C_ops.graph_send_recv(x, src_index, - dst_index, None, 'reduce_op', - reduce_op.upper(), 'out_size', - out_size) + out, tmp = _legacy_C_ops.graph_send_recv(x, src_index, dst_index, + None, 'reduce_op', + reduce_op.upper(), 'out_size', + out_size) return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, - reduce_op.upper(), out_size) + return _C_ops.graph_send_recv(x, src_index, dst_index, + reduce_op.upper(), out_size) check_variable_and_dtype( x, "X", ("float32", "float64", "int32", "int64", "float16"), @@ -288,18 +288,18 @@ def send_ue_recv(x, if _in_legacy_dygraph(): out_size = convert_out_size_to_list(out_size) - out, tmp = _C_ops.graph_send_ue_recv(x, y, src_index, dst_index, - None, 'message_op', - message_op.upper(), 'reduce_op', - reduce_op.upper(), 'out_size', - out_size) + out, tmp = _legacy_C_ops.graph_send_ue_recv(x, y, src_index, dst_index, + None, 'message_op', + message_op.upper(), + 'reduce_op', + reduce_op.upper(), + 'out_size', out_size) return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.final_state_graph_send_ue_recv(x, y, src_index, dst_index, - message_op.upper(), - reduce_op.upper(), - out_size) + return _C_ops.graph_send_ue_recv(x, y, src_index, dst_index, + message_op.upper(), reduce_op.upper(), + out_size) check_variable_and_dtype( x, "X", ("float32", "float64", "int32", "int64", "float16"), @@ -419,12 +419,12 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): y = 1. 
/ y if in_dygraph_mode(): - return _C_ops.final_state_graph_send_uv(x, y, src_index, dst_index, - message_op.upper()) + return _C_ops.graph_send_uv(x, y, src_index, dst_index, + message_op.upper()) else: if _in_legacy_dygraph(): - return _C_ops.graph_send_uv(x, y, src_index, dst_index, - "message_op", message_op.upper()) + return _legacy_C_ops.graph_send_uv(x, y, src_index, dst_index, + "message_op", message_op.upper()) else: helper = LayerHelper("send_uv", **locals()) check_variable_and_dtype( diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index 28740917c13f8..bb308444a5822 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -80,9 +80,9 @@ def _all_gather(tensor, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id nranks = paddle.distributed.collective._get_global_group( ).nranks if group is None else group.nranks - return paddle._C_ops.c_allgather(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'nranks', nranks) + return paddle._legacy_C_ops.c_allgather(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, 'nranks', nranks) class MoEScatter(PyLayer): diff --git a/python/paddle/incubate/distributed/models/moe/utils.py b/python/paddle/incubate/distributed/models/moe/utils.py index b195ffdb815e2..10203a0cd18aa 100644 --- a/python/paddle/incubate/distributed/models/moe/utils.py +++ b/python/paddle/incubate/distributed/models/moe/utils.py @@ -37,8 +37,9 @@ def _alltoall(in_tensor_list, group=None, use_calc_stream=True): return out else: ring_id = 0 if group is None else group.id - return paddle._C_ops.alltoall(in_tensor_list, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return paddle._legacy_C_ops.alltoall(in_tensor_list, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) def count_by_gate(gate, num_expert, world_size, require_pos=True, group=None): diff --git a/python/paddle/incubate/nn/functional/fused_matmul_bias.py b/python/paddle/incubate/nn/functional/fused_matmul_bias.py index d963c5e1ade5f..58e51c5fa5e9a 100644 --- a/python/paddle/incubate/nn/functional/fused_matmul_bias.py +++ b/python/paddle/incubate/nn/functional/fused_matmul_bias.py @@ -15,7 +15,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _non_static_mode from paddle.tensor.linalg import matmul -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def fused_matmul_bias(x, @@ -57,8 +57,9 @@ def fused_matmul_bias(x, if bias is None: return matmul(x, y, transpose_x, transpose_y, name) if _non_static_mode(): - return _C_ops.fused_gemm_epilogue(x, y, bias, 'trans_x', transpose_x, - 'trans_y', transpose_y) + return _legacy_C_ops.fused_gemm_epilogue(x, y, bias, 'trans_x', + transpose_x, 'trans_y', + transpose_y) helper = LayerHelper('fused_matmul_bias', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 506a282171bbd..b1d759b6953c3 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -16,7 +16,7 @@ from paddle.fluid.framework import _non_static_mode, default_main_program from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle.fluid import core, 
dygraph_utils -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -131,7 +131,7 @@ def fused_feedforward(x, if _non_static_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed - out, _, _, _, _, _, _, _, _, _, _ = _C_ops.fused_feedforward( + out, _, _, _, _, _, _, _, _, _, _ = _legacy_C_ops.fused_feedforward( x, None, None, linear1_weight, linear1_bias, linear2_weight, linear2_bias, ln1_scale, ln1_bias, ln2_scale, ln2_bias, 'pre_layer_norm', pre_layer_norm, 'ln1_epsilon', ln1_epsilon, @@ -307,7 +307,7 @@ def fused_bias_dropout_residual_layer_norm(x, if _non_static_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed - _, _, _, _, final_out = _C_ops.fused_bias_dropout_residual_layer_norm( + _, _, _, _, final_out = _legacy_C_ops.fused_bias_dropout_residual_layer_norm( x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test', not training, 'dropout_fix_seed', seed is not None, 'dropout_seed', @@ -531,7 +531,7 @@ def fused_multi_head_attention(x, assert qkv_weight.shape[1] * qkv_weight.shape[2] == qkv_weight.shape[ 3], "embed_dim must be divisible by num_heads." - _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, cache_kv_out, final_out = _C_ops.fused_attention( + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, cache_kv_out, final_out = _legacy_C_ops.fused_attention( x, pre_ln_scale, pre_ln_bias, qkv_weight, qkv_bias, cache_kv, attn_mask, linear_weight, linear_bias, ln_scale, ln_bias, 'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon, @@ -824,7 +824,7 @@ def fused_multi_transformer(x, mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer if _non_static_mode(): - cache_kv_out, final_out = _C_ops.fused_multi_transformer( + cache_kv_out, final_out = _legacy_C_ops.fused_multi_transformer( x, ln_scales, ln_biases, qkv_weights, qkv_biases, cache_kvs, time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales, ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, diff --git a/python/paddle/incubate/operators/graph_khop_sampler.py b/python/paddle/incubate/operators/graph_khop_sampler.py index 89014a7ad59a8..58e0fdafab679 100644 --- a/python/paddle/incubate/operators/graph_khop_sampler.py +++ b/python/paddle/incubate/operators/graph_khop_sampler.py @@ -17,7 +17,7 @@ from paddle.fluid.framework import _non_static_mode from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid import core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def graph_khop_sampler(row, @@ -91,14 +91,14 @@ def graph_khop_sampler(row, raise ValueError(f"`sorted_eid` should not be None " f"if return_eids is True.") edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ - _C_ops.graph_khop_sampler(row, sorted_eids, + _legacy_C_ops.graph_khop_sampler(row, sorted_eids, colptr, input_nodes, "sample_sizes", sample_sizes, "return_eids", True) return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids else: edge_src, edge_dst, sample_index, reindex_nodes, _ = \ - _C_ops.graph_khop_sampler(row, None, + _legacy_C_ops.graph_khop_sampler(row, None, colptr, input_nodes, "sample_sizes", sample_sizes, "return_eids", False) diff --git a/python/paddle/incubate/operators/graph_reindex.py b/python/paddle/incubate/operators/graph_reindex.py index 1c49d6af950d5..fd55752db4de8 100644 --- a/python/paddle/incubate/operators/graph_reindex.py +++ 
b/python/paddle/incubate/operators/graph_reindex.py @@ -17,7 +17,7 @@ from paddle.fluid.framework import _non_static_mode from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid import core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def graph_reindex(x, @@ -109,7 +109,7 @@ def graph_reindex(x, if _non_static_mode(): reindex_src, reindex_dst, out_nodes = \ - _C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, + _legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, "flag_buffer_hashtable", flag_buffer_hashtable) return reindex_src, reindex_dst, out_nodes diff --git a/python/paddle/incubate/operators/graph_sample_neighbors.py b/python/paddle/incubate/operators/graph_sample_neighbors.py index 63424b395c703..3e385de7814ec 100644 --- a/python/paddle/incubate/operators/graph_sample_neighbors.py +++ b/python/paddle/incubate/operators/graph_sample_neighbors.py @@ -17,7 +17,7 @@ from paddle.fluid.framework import _non_static_mode from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid import core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def graph_sample_neighbors(row, @@ -101,7 +101,7 @@ def graph_sample_neighbors(row, "is True.") if _non_static_mode(): - out_neighbors, out_count, out_eids = _C_ops.graph_sample_neighbors( + out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( row, colptr, input_nodes, eids, perm_buffer, "sample_size", sample_size, "return_eids", return_eids, "flag_perm_buffer", flag_perm_buffer) diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index 4181885d419af..b8b01f9aad2e6 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -18,7 +18,7 @@ from paddle.fluid.framework import Variable from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from paddle.fluid.layers.tensor import cast -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle.utils.deprecated as deprecated @@ -122,15 +122,15 @@ def graph_send_recv(x, if _in_legacy_dygraph(): out_size = convert_out_size_to_list(out_size) - out, tmp = _C_ops.graph_send_recv(x, src_index, - dst_index, None, 'reduce_op', - pool_type.upper(), 'out_size', - out_size) + out, tmp = _legacy_C_ops.graph_send_recv(x, src_index, dst_index, + None, 'reduce_op', + pool_type.upper(), 'out_size', + out_size) return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, - pool_type.upper(), out_size) + return _C_ops.graph_send_recv(x, src_index, dst_index, + pool_type.upper(), out_size) check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), "graph_send_recv") diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index 70abe41f62462..2f4cb4d3bde90 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -33,7 +33,7 @@ from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.param_attr import ParamAttr -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def resnet_unit(x, filter_x, scale_x, bias_x, mean_x, var_x, z, filter_z, diff --git 
a/python/paddle/incubate/operators/softmax_mask_fuse.py b/python/paddle/incubate/operators/softmax_mask_fuse.py index 1b70dfce6d0f1..b02903d87fe6a 100644 --- a/python/paddle/incubate/operators/softmax_mask_fuse.py +++ b/python/paddle/incubate/operators/softmax_mask_fuse.py @@ -17,7 +17,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _non_static_mode from paddle.fluid import core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def softmax_mask_fuse(x, mask, name=None): @@ -59,7 +59,7 @@ def softmax_mask_fuse(x, mask, name=None): # [[[[0.02404429, 0.04658398, 0.02746007, ..., 0.01489375, 0.02397441, 0.02851614] ... ]]] """ if _non_static_mode(): - out = _C_ops.fused_softmax_mask(x, mask) + out = _legacy_C_ops.fused_softmax_mask(x, mask) return out helper = LayerHelper('fused_softmax_mask', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) diff --git a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py index dda5981f5adba..e287df1e710c4 100644 --- a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py +++ b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py @@ -17,7 +17,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _non_static_mode from paddle.fluid import core -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops def softmax_mask_fuse_upper_triangle(x): @@ -59,7 +59,7 @@ def softmax_mask_fuse_upper_triangle(x): # ... ]]] """ if _non_static_mode(): - out = _C_ops.fused_softmax_mask_upper_triangle(x) + out = _legacy_C_ops.fused_softmax_mask_upper_triangle(x) return out helper = LayerHelper('fused_softmax_mask_upper_triangle', **locals()) diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index b7d499f77292e..67be022c288f2 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -20,7 +20,7 @@ import numpy as np from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.wrapped_decorator import signature_safe_contextmanager -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -234,13 +234,13 @@ def _append_optimize_op(self, block, param_and_grad): num_updates = self._get_accumulator('num_updates', param_and_grad[0]) if in_dygraph_mode(): - _, _, _, _, _, _ = _C_ops.final_state_average_accumulates_( + _, _, _, _, _, _ = _C_ops.average_accumulates_( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, self.average_window, self.max_average_window, self.min_average_window) return None elif framework._non_static_mode(): - _, _, _, _, _, _ = _C_ops.average_accumulates( + _, _, _, _, _, _ = _legacy_C_ops.average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, diff --git a/python/paddle/incubate/sparse/binary.py b/python/paddle/incubate/sparse/binary.py index 7a7861f7b20e7..93ce90c9f021a 100644 --- a/python/paddle/incubate/sparse/binary.py +++ b/python/paddle/incubate/sparse/binary.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import dygraph_only, core __all__ = [] @@ -94,7 +94,7 @@ def matmul(x, y, name=None): # [2., 2.], # [3., 3.]]) """ - return _C_ops.final_state_sparse_matmul(x, y) + return _C_ops.sparse_matmul(x, y) @dygraph_only @@ -154,7 +154,7 @@ def masked_matmul(x, y, mask, name=None): # values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981]) """ - return _C_ops.final_state_sparse_masked_matmul(x, y, mask) + return _C_ops.sparse_masked_matmul(x, y, mask) @dygraph_only @@ -210,7 +210,7 @@ def mv(x, vec, name=None): # [-3.85499096, -2.42975140, -1.75087738]) """ - return _C_ops.final_state_sparse_mv(x, vec) + return _C_ops.sparse_mv(x, vec) def add(x, y, name=None): @@ -253,8 +253,8 @@ def add(x, y, name=None): """ if y.dtype != x.dtype: - y = _C_ops.final_state_sparse_cast(y, None, x.dtype) - return _C_ops.final_state_sparse_add(x, y) + y = _C_ops.sparse_cast(y, None, x.dtype) + return _C_ops.sparse_add(x, y) @dygraph_only @@ -298,8 +298,8 @@ def subtract(x, y, name=None): """ if y.dtype != x.dtype: - y = _C_ops.final_state_sparse_cast(y, None, x.dtype) - return _C_ops.final_state_sparse_subtract(x, y) + y = _C_ops.sparse_cast(y, None, x.dtype) + return _C_ops.sparse_subtract(x, y) @dygraph_only @@ -343,11 +343,11 @@ def multiply(x, y, name=None): """ if isinstance(y, (int, float)): - return _C_ops.final_state_sparse_scale(x, float(y), 0.0, True) + return _C_ops.sparse_scale(x, float(y), 0.0, True) else: if y.dtype != x.dtype: - y = _C_ops.final_state_sparse_cast(y, None, x.dtype) - return _C_ops.final_state_sparse_multiply(x, y) + y = _C_ops.sparse_cast(y, None, x.dtype) + return _C_ops.sparse_multiply(x, y) @dygraph_only @@ -391,11 +391,11 @@ def divide(x, y, name=None): """ if x.dtype in _int_dtype_: - x = _C_ops.final_state_sparse_cast(x, None, core.VarDesc.VarType.FP32) + x = _C_ops.sparse_cast(x, None, core.VarDesc.VarType.FP32) if isinstance(y, (int, float)): - return _C_ops.final_state_sparse_divide_scalar(x, float(y)) + return _C_ops.sparse_divide_scalar(x, float(y)) else: if y.dtype != x.dtype: - y = _C_ops.final_state_sparse_cast(y, None, x.dtype) - return _C_ops.final_state_sparse_divide(x, y) + y = _C_ops.sparse_cast(y, None, x.dtype) + return _C_ops.sparse_divide(x, y) diff --git a/python/paddle/incubate/sparse/creation.py b/python/paddle/incubate/sparse/creation.py index af5b84139f788..143dbd770814c 100644 --- a/python/paddle/incubate/sparse/creation.py +++ b/python/paddle/incubate/sparse/creation.py @@ -13,7 +13,7 @@ # limitations under the License. 
import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import core, dygraph_only from paddle.fluid.framework import _current_expected_place, _get_paddle_place from paddle.tensor import to_tensor, max @@ -166,8 +166,7 @@ def sparse_coo_tensor(indices, "the number of dimensions(len(shape) must be sparse_dim({}) + dense_dim({}), but get {}" .format(sparse_dim, dense_dim, len(shape))) - return _C_ops.final_state_sparse_create_sparse_coo_tensor( - values, indices, shape) + return _C_ops.sparse_create_sparse_coo_tensor(values, indices, shape) #TODO: need to support shape is None diff --git a/python/paddle/incubate/sparse/multiary.py b/python/paddle/incubate/sparse/multiary.py index 17cf75fdc3903..d65847f138306 100644 --- a/python/paddle/incubate/sparse/multiary.py +++ b/python/paddle/incubate/sparse/multiary.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import dygraph_only __all__ = [] @@ -78,4 +78,4 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): out = paddle.incubate.sparse.addmm(input, x, y, 3.0, 2.0) """ - return _C_ops.final_state_sparse_addmm(input, x, y, alpha, beta) + return _C_ops.sparse_addmm(input, x, y, alpha, beta) diff --git a/python/paddle/incubate/sparse/nn/functional/activation.py b/python/paddle/incubate/sparse/nn/functional/activation.py index 2305abc8d530e..ddaa6ada01be1 100644 --- a/python/paddle/incubate/sparse/nn/functional/activation.py +++ b/python/paddle/incubate/sparse/nn/functional/activation.py @@ -14,7 +14,7 @@ __all__ = [] -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import dygraph_only @@ -45,7 +45,7 @@ def relu(x, name=None): out = paddle.incubate.sparse.nn.functional.relu(sparse_x) # [0., 0., 1.] """ - return _C_ops.final_state_sparse_relu(x) + return _C_ops.sparse_relu(x) @dygraph_only @@ -101,7 +101,7 @@ def softmax(x, axis=-1, name=None): # 1. 
]) """ - return _C_ops.final_state_sparse_softmax(x, axis) + return _C_ops.sparse_softmax(x, axis) @dygraph_only @@ -130,7 +130,7 @@ def relu6(x, name=None): sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.nn.functional.relu6(sparse_x) """ - return _C_ops.final_state_sparse_relu6(x, 6.0) + return _C_ops.sparse_relu6(x, 6.0) @dygraph_only @@ -166,4 +166,4 @@ def leaky_relu(x, negative_slope=0.01, name=None): sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.nn.functional.leaky_relu(sparse_x, 0.5) """ - return _C_ops.final_state_sparse_leaky_relu(x, negative_slope) + return _C_ops.sparse_leaky_relu(x, negative_slope) diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index 605cadc2b091e..cd3e8e3551f5b 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -14,7 +14,7 @@ __all__ = [] -from paddle import _C_ops, in_dynamic_mode +from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle.fluid.layers.utils import convert_to_list from paddle.fluid.layers.nn import elementwise_add from ...creation import sparse_coo_tensor @@ -63,9 +63,9 @@ def _conv3d(x, dilation = convert_to_list(dilation, dims, 'dilation') op_type = "conv3d" - pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, - stride, groups, subm, - key if key is not None else "") + pre_bias = _C_ops.sparse_conv3d(x, weight, padding, dilation, stride, + groups, subm, + key if key is not None else "") if bias is not None: values = pre_bias.values() add_bias = elementwise_add(values, bias, axis=1) diff --git a/python/paddle/incubate/sparse/nn/functional/pooling.py b/python/paddle/incubate/sparse/nn/functional/pooling.py index 8ed4444e89c26..cae93553b175a 100644 --- a/python/paddle/incubate/sparse/nn/functional/pooling.py +++ b/python/paddle/incubate/sparse/nn/functional/pooling.py @@ -13,7 +13,7 @@ # limitations under the License. 
from paddle.fluid.layers import utils -from paddle import _C_ops, in_dynamic_mode +from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle.nn.functional.pooling import _update_padding_nd __all__ = [] @@ -95,5 +95,4 @@ def max_pool3d(x, #TODO(zkh2016): remove the dependency on dilation from the backend dilation = [1, 1, 1] - return _C_ops.final_state_sparse_maxpool(x, kernel_size, padding, dilation, - stride) + return _C_ops.sparse_maxpool(x, kernel_size, padding, dilation, stride) diff --git a/python/paddle/incubate/sparse/nn/functional/transformer.py b/python/paddle/incubate/sparse/nn/functional/transformer.py index f69714700bf5d..a4c9faf9ad57a 100644 --- a/python/paddle/incubate/sparse/nn/functional/transformer.py +++ b/python/paddle/incubate/sparse/nn/functional/transformer.py @@ -14,7 +14,7 @@ __all__ = [] -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import dygraph_only @@ -88,7 +88,5 @@ def attention(query, output = paddle.incubate.sparse.nn.functional.attention(query, key, value, sp_mask, kp_mask, attn_mask) output.backward() """ - return _C_ops.final_state_sparse_fused_attention(query, key, value, - sparse_mask, - key_padding_mask, - attn_mask) + return _C_ops.sparse_fused_attention(query, key, value, sparse_mask, + key_padding_mask, attn_mask) diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/incubate/sparse/unary.py index ae55a5b9abcbc..472a71d482b81 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -14,7 +14,7 @@ import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import dygraph_only, core, convert_np_dtype_to_dtype_ __all__ = [] @@ -56,7 +56,7 @@ def sin(x, name=None): out = paddle.incubate.sparse.sin(sparse_x) """ - return _C_ops.final_state_sparse_sin(x) + return _C_ops.sparse_sin(x) @dygraph_only @@ -86,7 +86,7 @@ def tan(x, name=None): out = paddle.incubate.sparse.tan(sparse_x) """ - return _C_ops.final_state_sparse_tan(x) + return _C_ops.sparse_tan(x) @dygraph_only @@ -116,7 +116,7 @@ def asin(x, name=None): out = paddle.incubate.sparse.asin(sparse_x) """ - return _C_ops.final_state_sparse_asin(x) + return _C_ops.sparse_asin(x) @dygraph_only @@ -146,7 +146,7 @@ def atan(x, name=None): out = paddle.incubate.sparse.atan(sparse_x) """ - return _C_ops.final_state_sparse_atan(x) + return _C_ops.sparse_atan(x) @dygraph_only @@ -176,7 +176,7 @@ def sinh(x, name=None): out = paddle.incubate.sparse.sinh(sparse_x) """ - return _C_ops.final_state_sparse_sinh(x) + return _C_ops.sparse_sinh(x) @dygraph_only @@ -206,7 +206,7 @@ def asinh(x, name=None): out = paddle.incubate.sparse.asinh(sparse_x) """ - return _C_ops.final_state_sparse_asinh(x) + return _C_ops.sparse_asinh(x) @dygraph_only @@ -236,7 +236,7 @@ def atanh(x, name=None): out = paddle.incubate.sparse.atanh(sparse_x) """ - return _C_ops.final_state_sparse_atanh(x) + return _C_ops.sparse_atanh(x) @dygraph_only @@ -266,7 +266,7 @@ def tanh(x, name=None): out = paddle.incubate.sparse.tanh(sparse_x) """ - return _C_ops.final_state_sparse_tanh(x) + return _C_ops.sparse_tanh(x) @dygraph_only @@ -296,7 +296,7 @@ def square(x, name=None): out = paddle.incubate.sparse.square(sparse_x) """ - return _C_ops.final_state_sparse_square(x) + return _C_ops.sparse_square(x) @dygraph_only @@ -326,7 +326,7 @@ def sqrt(x, name=None): out = paddle.incubate.sparse.sqrt(sparse_x) """ - return _C_ops.final_state_sparse_sqrt(x) + return _C_ops.sparse_sqrt(x) 
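Every sparse hunk above follows the same mechanical rename: the Python wrapper keeps its public signature, and only the dispatched symbol drops its final_state_ prefix, so _C_ops.final_state_sparse_sqrt becomes _C_ops.sparse_sqrt. A minimal sketch of the pattern, assuming a hypothetical sparse_demo kernel rather than any real Paddle op:

# Sketch of the rename applied throughout the sparse files in this diff.
# `sparse_demo` is a hypothetical kernel name, not a real Paddle op.
from paddle import _C_ops
from paddle.fluid.framework import dygraph_only

@dygraph_only
def demo(x, name=None):
    # Before this change the dispatch read:
    #     return _C_ops.final_state_sparse_demo(x)
    # After it, the final-state op keeps the plain sparse_ name:
    return _C_ops.sparse_demo(x)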
@dygraph_only @@ -356,7 +356,7 @@ def log1p(x, name=None): out = paddle.incubate.sparse.log1p(sparse_x) """ - return _C_ops.final_state_sparse_log1p(x) + return _C_ops.sparse_log1p(x) @dygraph_only @@ -391,7 +391,7 @@ def cast(x, index_dtype=None, value_dtype=None, name=None): index_dtype = convert_np_dtype_to_dtype_(index_dtype) if value_dtype and not isinstance(value_dtype, core.VarDesc.VarType): value_dtype = convert_np_dtype_to_dtype_(value_dtype) - return _C_ops.final_state_sparse_cast(x, index_dtype, value_dtype) + return _C_ops.sparse_cast(x, index_dtype, value_dtype) @dygraph_only @@ -422,7 +422,7 @@ def pow(x, factor, name=None): out = paddle.incubate.sparse.pow(sparse_x, 2) """ - return _C_ops.final_state_sparse_pow(x, float(factor)) + return _C_ops.sparse_pow(x, float(factor)) @dygraph_only @@ -452,7 +452,7 @@ def neg(x, name=None): out = paddle.incubate.sparse.neg(sparse_x) """ - return _C_ops.final_state_sparse_scale(x, -1.0, 0.0, True) + return _C_ops.sparse_scale(x, -1.0, 0.0, True) @dygraph_only @@ -482,7 +482,7 @@ def abs(x, name=None): out = paddle.incubate.sparse.abs(sparse_x) """ - return _C_ops.final_state_sparse_abs(x) + return _C_ops.sparse_abs(x) @dygraph_only @@ -512,7 +512,7 @@ def coalesce(x): print(sp_x.values()) #[3.0, 3.0] """ - return _C_ops.final_state_sparse_coalesce(x) + return _C_ops.sparse_coalesce(x) @dygraph_only @@ -544,8 +544,8 @@ def rad2deg(x, name=None): """ if x.dtype in _int_dtype_: - x = _C_ops.final_state_sparse_cast(x, None, core.VarDesc.VarType.FP32) - return _C_ops.final_state_sparse_scale(x, 180.0 / np.pi, 0.0, True) + x = _C_ops.sparse_cast(x, None, core.VarDesc.VarType.FP32) + return _C_ops.sparse_scale(x, 180.0 / np.pi, 0.0, True) @dygraph_only @@ -577,8 +577,8 @@ def deg2rad(x, name=None): """ if x.dtype in _int_dtype_: - x = _C_ops.final_state_sparse_cast(x, None, core.VarDesc.VarType.FP32) - return _C_ops.final_state_sparse_scale(x, np.pi / 180.0, 0.0, True) + x = _C_ops.sparse_cast(x, None, core.VarDesc.VarType.FP32) + return _C_ops.sparse_scale(x, np.pi / 180.0, 0.0, True) @dygraph_only @@ -607,4 +607,4 @@ def expm1(x, name=None): sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.expm1(sparse_x) """ - return _C_ops.final_state_sparse_expm1(x) + return _C_ops.sparse_expm1(x) diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index 7ce2e735b6f11..5b2a9b3c66f16 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -14,7 +14,7 @@ from paddle.fluid.layer_helper import LayerHelper, _non_static_mode from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode __all__ = [] @@ -52,9 +52,10 @@ def segment_sum(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_ids, "SUM")[0] + return _C_ops.segment_pool(data, segment_ids, "SUM")[0] if _in_legacy_dygraph(): - out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") + out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', + "SUM") return out check_variable_and_dtype(data, "X", @@ -113,9 +114,10 @@ def segment_mean(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_ids, "MEAN")[0] + return _C_ops.segment_pool(data, segment_ids, "MEAN")[0] if _non_static_mode(): - out, tmp = _C_ops.segment_pool(data, segment_ids, 
'pooltype', "MEAN") + out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', + "MEAN") return out check_variable_and_dtype(data, "X", @@ -173,10 +175,11 @@ def segment_min(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_ids, "MIN")[0] + return _C_ops.segment_pool(data, segment_ids, "MIN")[0] if _non_static_mode(): - out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") + out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', + "MIN") return out check_variable_and_dtype(data, "X", @@ -234,11 +237,12 @@ def segment_max(data, segment_ids, name=None): """ if in_dygraph_mode(): - out, tmp = _C_ops.final_state_segment_pool(data, segment_ids, "MAX") + out, tmp = _C_ops.segment_pool(data, segment_ids, "MAX") return out if _non_static_mode(): - out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX") + out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', + "MAX") return out check_variable_and_dtype(data, "X", diff --git a/python/paddle/incubate/xpu/resnet_block.py b/python/paddle/incubate/xpu/resnet_block.py index 2b690cd7bf929..39b439730759c 100644 --- a/python/paddle/incubate/xpu/resnet_block.py +++ b/python/paddle/incubate/xpu/resnet_block.py @@ -31,7 +31,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.param_attr import ParamAttr -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = ['resnet_basic_block', 'ResNetBasicBlock'] diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 919daa31d06fc..84b18b69e8659 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -24,7 +24,7 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.framework import core, _varbase_creator, _non_static_mode, _in_legacy_dygraph import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -805,8 +805,8 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): total = _varbase_creator(dtype="int32") topk_out, topk_indices = paddle.topk(input, k=k) - _acc, _, _ = _C_ops.accuracy(topk_out, topk_indices, label, correct, - total) + _acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label, + correct, total) return _acc diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f6898347de2b0..f1a0b50e1d34c 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -26,7 +26,7 @@ from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode from ...fluid.data_feeder import check_variable_and_dtype, check_dtype import paddle -from paddle import _C_ops, in_dynamic_mode +from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle.framework import core from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode @@ -64,9 +64,9 @@ def celu(x, alpha=1.0, name=None): raise ZeroDivisionError("alpha cannot be 0 for celu") if _in_legacy_dygraph(): - return _C_ops.celu(x, 'alpha', alpha) + return _legacy_C_ops.celu(x, 'alpha', alpha) if in_dygraph_mode(): - return _C_ops.final_state_celu(x, alpha) + return _C_ops.celu(x, alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu') helper = LayerHelper("celu", **locals()) @@ -114,10 +114,10 @@ def elu(x, alpha=1.0, name=None): """ if in_dygraph_mode(): - return 
_C_ops.final_state_elu(x, alpha) + return _C_ops.elu(x, alpha) if _in_legacy_dygraph(): - return _C_ops.elu(x, 'alpha', alpha) + return _legacy_C_ops.elu(x, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') helper = LayerHelper("elu", **locals()) @@ -137,8 +137,8 @@ def elu_(x, alpha=1.0, name=None): """ assert alpha >= 0., "elu_ only support alpha >= 0, please use elu instead." if in_dygraph_mode(): - return _C_ops.final_state_elu_(x, alpha) - return _C_ops.elu_(x, 'alpha', alpha) + return _C_ops.elu_(x, alpha) + return _legacy_C_ops.elu_(x, 'alpha', alpha) def gelu(x, approximate=False, name=None): @@ -182,10 +182,10 @@ def gelu(x, approximate=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_gelu(x, approximate) + return _C_ops.gelu(x, approximate) if _in_legacy_dygraph(): - return _C_ops.gelu(x, 'approximate', approximate) + return _legacy_C_ops.gelu(x, 'approximate', approximate) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu') helper = LayerHelper("gelu", **locals()) @@ -232,10 +232,10 @@ def hardshrink(x, threshold=0.5, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_hard_shrink(x, threshold) + return _C_ops.hard_shrink(x, threshold) if _in_legacy_dygraph(): - return _C_ops.hard_shrink(x, 'threshold', threshold) + return _legacy_C_ops.hard_shrink(x, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hardshrink') @@ -285,10 +285,10 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_brelu(x, min, max) + return _C_ops.brelu(x, min, max) if _in_legacy_dygraph(): - return _C_ops.brelu(x, 't_min', min, 't_max', max) + return _legacy_C_ops.brelu(x, 't_min', min, 't_max', max) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hardtanh') @@ -344,10 +344,10 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_hard_sigmoid(x, slope, offset) + return _C_ops.hard_sigmoid(x, slope, offset) if _in_legacy_dygraph(): - return _C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) + return _legacy_C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hardsigmoid') @@ -402,9 +402,9 @@ def hardswish(x, name=None): """ if _in_legacy_dygraph(): - return _C_ops.hard_swish(x) + return _legacy_C_ops.hard_swish(x) if in_dygraph_mode(): - return _C_ops.final_state_hard_swish(x, 6, 6, 3) + return _C_ops.hard_swish(x, 6, 6, 3) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hardswish') @@ -449,10 +449,10 @@ def leaky_relu(x, negative_slope=0.01, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_leaky_relu(x, negative_slope) + return _C_ops.leaky_relu(x, negative_slope) if _in_legacy_dygraph(): - return _C_ops.leaky_relu(x, 'alpha', negative_slope) + return _legacy_C_ops.leaky_relu(x, 'alpha', negative_slope) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'leaky_relu') @@ -542,9 +542,10 @@ def prelu(x, weight, data_format="NCHW", name=None): mode = 'channel' if in_dygraph_mode(): - return _C_ops.final_state_prelu(x, weight, data_format, mode) + return _C_ops.prelu(x, weight, data_format, mode) if _in_legacy_dygraph(): - return _C_ops.prelu(x, weight, 'mode', mode, 'data_format', data_format) + return _legacy_C_ops.prelu(x, weight, 'mode', mode, 'data_format', + data_format) helper = 
LayerHelper('prelu', **locals()) out = helper.create_variable_for_type_inference(x.dtype) @@ -659,8 +660,8 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): is_test = not training if _in_legacy_dygraph(): - out, noise = _C_ops.rrelu(x, 'lower', lower, 'upper', upper, 'is_test', - is_test) + out, noise = _legacy_C_ops.rrelu(x, 'lower', lower, 'upper', upper, + 'is_test', is_test) return out helper = LayerHelper('rrelu', **locals()) @@ -705,9 +706,9 @@ def relu(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_relu(x) - if _in_legacy_dygraph(): return _C_ops.relu(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.relu(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu') helper = LayerHelper('relu', **locals()) out = helper.create_variable_for_type_inference(x.dtype) @@ -722,9 +723,9 @@ def relu_(x, name=None): Please refer to :ref:`api_nn_cn_relu`. """ if in_dygraph_mode(): - return _C_ops.final_state_relu_(x) - if _in_legacy_dygraph(): return _C_ops.relu_(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.relu_(x) def log_sigmoid(x, name=None): @@ -754,10 +755,10 @@ def log_sigmoid(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_logsigmoid(x) + return _C_ops.logsigmoid(x) if _in_legacy_dygraph(): - return _C_ops.logsigmoid(x) + return _legacy_C_ops.logsigmoid(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'log_sigmoid') @@ -823,9 +824,9 @@ def maxout(x, groups, axis=1, name=None): # [0.7142536 0.88725346 0.61093384 0.38833922]]]] """ if _in_legacy_dygraph(): - return _C_ops.maxout(x, 'groups', groups, 'axis', axis) + return _legacy_C_ops.maxout(x, 'groups', groups, 'axis', axis) if in_dygraph_mode(): - return _C_ops.final_state_maxout(x, groups, axis) + return _C_ops.maxout(x, groups, axis) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') if axis not in [1, -1, 3]: raise ValueError( @@ -874,9 +875,9 @@ def relu6(x, name=None): """ threshold = 6.0 if in_dygraph_mode(): - return _C_ops.final_state_relu6(x, threshold) + return _C_ops.relu6(x, threshold) if in_dynamic_mode(): - return _C_ops.relu6(x, 'threshold', threshold) + return _legacy_C_ops.relu6(x, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6') helper = LayerHelper('relu6', **locals()) @@ -934,9 +935,9 @@ def selu(x, "The alpha must be no less than zero. 
Received: {}.".format(alpha)) if in_dygraph_mode(): - return _C_ops.final_state_selu(x, scale, alpha) + return _C_ops.selu(x, scale, alpha) if _in_legacy_dygraph(): - return _C_ops.selu(x, 'scale', scale, 'alpha', alpha) + return _legacy_C_ops.selu(x, 'scale', scale, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu') helper = LayerHelper('selu', **locals()) @@ -978,9 +979,9 @@ def silu(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_silu(x) - if _in_legacy_dygraph(): return _C_ops.silu(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.silu(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'silu') helper = LayerHelper("silu", **locals()) @@ -1111,13 +1112,14 @@ def softmax(x, axis=-1, dtype=None, name=None): if in_dygraph_mode(): outs_cast = x if dtype is None \ - else _C_ops.final_state_cast(x, dtype) - return _C_ops.final_state_softmax(outs_cast, axis) + else _C_ops.cast(x, dtype) + return _C_ops.softmax(outs_cast, axis) if _in_legacy_dygraph(): outs_cast = x if dtype is None \ - else _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) - return _C_ops.softmax(outs_cast, 'axis', axis, 'use_cudnn', use_cudnn) + else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + return _legacy_C_ops.softmax(outs_cast, 'axis', axis, 'use_cudnn', + use_cudnn) if dtype is None: check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -1163,13 +1165,14 @@ def softmax_(x, axis=-1, dtype=None, name=None): if in_dygraph_mode(): outs_cast = x if dtype is None \ - else _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) - return _C_ops.final_state_softmax_(outs_cast, axis) + else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + return _C_ops.softmax_(outs_cast, axis) if _in_legacy_dygraph(): outs_cast = x if dtype is None \ - else _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) - return _C_ops.softmax_(outs_cast, 'axis', axis, 'use_cudnn', use_cudnn) + else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + return _legacy_C_ops.softmax_(outs_cast, 'axis', axis, 'use_cudnn', + use_cudnn) def softplus(x, beta=1, threshold=20, name=None): @@ -1203,10 +1206,10 @@ def softplus(x, beta=1, threshold=20, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_softplus(x, beta, threshold) + return _C_ops.softplus(x, beta, threshold) if _in_legacy_dygraph(): - return _C_ops.softplus(x, 'beta', beta, 'threshold', threshold) + return _legacy_C_ops.softplus(x, 'beta', beta, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'softplus') @@ -1262,9 +1265,9 @@ def softshrink(x, threshold=0.5, name=None): threshold)) if in_dygraph_mode(): - return _C_ops.final_state_soft_shrink(x, threshold) + return _C_ops.soft_shrink(x, threshold) if _in_legacy_dygraph(): - return _C_ops.softshrink(x, 'lambda', threshold) + return _legacy_C_ops.softshrink(x, 'lambda', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'softshrink') @@ -1304,9 +1307,9 @@ def softsign(x, name=None): out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """ if in_dygraph_mode(): - return _C_ops.final_state_softsign(x) - if in_dynamic_mode(): return _C_ops.softsign(x) + if in_dynamic_mode(): + return _legacy_C_ops.softsign(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'softsign') @@ -1343,9 +1346,9 @@ def swish(x, name=None): out = F.swish(x) # [-0.238406, 0., 0.731059] """ if 
in_dygraph_mode(): - return _C_ops.final_state_swish(x, 1.0) + return _C_ops.swish(x, 1.0) if _in_legacy_dygraph(): - return _C_ops.swish(x, 'beta', 1.0) + return _legacy_C_ops.swish(x, 'beta', 1.0) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') helper = LayerHelper('swish', **locals()) @@ -1388,9 +1391,9 @@ def mish(x, name=None): out = F.mish(x) # [-0.03357624, 0., 4.99955208] """ if in_dygraph_mode(): - return _C_ops.final_state_mish(x, 20) + return _C_ops.mish(x, 20) if _in_legacy_dygraph(): - return _C_ops.mish(x) + return _legacy_C_ops.mish(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mish') helper = LayerHelper('mish', **locals()) @@ -1426,10 +1429,10 @@ def tanhshrink(x, name=None): out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """ if in_dygraph_mode(): - return _C_ops.final_state_tanh_shrink(x) + return _C_ops.tanh_shrink(x) if _in_legacy_dygraph(): - return _C_ops.tanh_shrink(x) + return _legacy_C_ops.tanh_shrink(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'tanhshrink') @@ -1475,10 +1478,10 @@ def thresholded_relu(x, threshold=1.0, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_thresholded_relu(x, threshold) + return _C_ops.thresholded_relu(x, threshold) if _in_legacy_dygraph(): - return _C_ops.thresholded_relu(x, 'threshold', threshold) + return _legacy_C_ops.thresholded_relu(x, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'thresholded_relu') @@ -1552,13 +1555,13 @@ def log_softmax(x, axis=-1, dtype=None, name=None): if in_dygraph_mode(): if dtype is not None: - x = _C_ops.final_state_cast(x, dtype) - return _C_ops.final_state_log_softmax(x, axis) + x = _C_ops.cast(x, dtype) + return _C_ops.log_softmax(x, axis) if _in_legacy_dygraph(): if dtype is not None: - x = _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) - return _C_ops.log_softmax(x, 'axis', axis) + x = _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + return _legacy_C_ops.log_softmax(x, 'axis', axis) if dtype is None: check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -1695,11 +1698,11 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_gumbel_softmax(x, temperature, hard, axis) + return _C_ops.gumbel_softmax(x, temperature, hard, axis) if in_dynamic_mode(): - return _C_ops.gumbel_softmax(x, 'temperature', temperature, 'hard', - hard, 'axis', axis) + return _legacy_C_ops.gumbel_softmax(x, 'temperature', temperature, + 'hard', hard, 'axis', axis) helper = LayerHelper("gumbel_softmax", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'gumbel_softmax') diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index d52a4b225ca75..846427782f12c 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -31,7 +31,7 @@ from ...fluid import dygraph_utils -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.framework import in_dynamic_mode from paddle.tensor.creation import full from paddle.framework import core @@ -150,8 +150,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): "of 2 or 4 integers") if in_dygraph_mode(): - return _C_ops.final_state_unfold(x, kernel_sizes, strides, paddings, - dilations) + return _C_ops.unfold(x, kernel_sizes, strides, paddings, dilations) out = 
helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type="unfold", @@ -176,7 +175,7 @@ def interpolate(x, name=None): """ - This op resizes a batch of images. + This API resizes a batch of images. The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), @@ -341,46 +340,28 @@ def interpolate(x, A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). - Raises: - TypeError: size should be a list or tuple or Tensor. - ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', - 'trilinear', 'bicubic', 'area' or 'nearest' currently. - ValueError: 'linear' only support 3-D tensor. - ValueError: 'bilinear' and 'bicubic' only support 4-D tensor. - ValueError: 'nearest' only support 4-D or 5-D tensor. - ValueError: 'trilinear' only support 5-D tensor. - ValueError: One of size and scale_factor must not be None. - ValueError: size length should be 1 for input 3-D tensor. - ValueError: size length should be 2 for input 4-D tensor. - ValueError: size length should be 3 for input 5-D tensor. - ValueError: scale_factor should be greater than zero. - TypeError: align_corners should be a bool value - ValueError: align_mode can only be '0' or '1' - ValueError: data_format can only be 'NCW', 'NWC', 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'. + Examples: .. code-block:: python - import paddle - import numpy as np - import paddle.nn.functional as F - - # given out size - input_data = np.random.rand(2,3,6,10).astype("float32") - x = paddle.to_tensor(input_data) - output_1 = F.interpolate(x=x, size=[12,12]) - print(output_1.shape) - # [2L, 3L, 12L, 12L] - - # given scale - output_2 = F.interpolate(x=x, scale_factor=[2,1]) - print(output_2.shape) - # [2L, 3L, 12L, 10L] - - # bilinear interp - output_3 = F.interpolate(x=x, scale_factor=[2,1], mode="bilinear") - print(output_2.shape) - # [2L, 3L, 12L, 10L] + import paddle + import paddle.nn.functional as F + + input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + output_1 = F.interpolate(x=input_data, size=[12,12]) + print(output_1.shape) + # [2L, 3L, 12L, 12L] + + # given scale + output_2 = F.interpolate(x=input_data, scale_factor=[2,1]) + print(output_2.shape) + # [2L, 3L, 12L, 10L] + + # bilinear interp + output_3 = F.interpolate(x=input_data, scale_factor=[2,1], mode="bilinear") + print(output_2.shape) + # [2L, 3L, 12L, 10L] """ data_format = data_format.upper() resample = mode.upper() @@ -592,7 +573,7 @@ def _is_list_or_turple_(data): if resample_type == "linear": if in_dygraph_mode(): - out = _C_ops.final_state_linear_interp( + out = _C_ops.linear_interp( x, inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, @@ -601,10 +582,10 @@ def _is_list_or_turple_(data): attrs['interp_method'], attrs['align_corners'], attrs['align_mode']) else: - out = _C_ops.linear_interp_v2(x, *dy_attr) + out = _legacy_C_ops.linear_interp_v2(x, *dy_attr) elif resample_type == "bilinear": if in_dygraph_mode(): - out = _C_ops.final_state_bilinear_interp( + out = _C_ops.bilinear_interp( x, inputs['OutSize'] 
if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, @@ -613,10 +594,10 @@ def _is_list_or_turple_(data): attrs['interp_method'], attrs['align_corners'], attrs['align_mode']) else: - out = _C_ops.bilinear_interp_v2(x, *dy_attr) + out = _legacy_C_ops.bilinear_interp_v2(x, *dy_attr) elif resample_type == "trilinear": if in_dygraph_mode(): - out = _C_ops.final_state_trilinear_interp( + out = _C_ops.trilinear_interp( x, inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, @@ -625,10 +606,10 @@ def _is_list_or_turple_(data): attrs['interp_method'], attrs['align_corners'], attrs['align_mode']) else: - out = _C_ops.trilinear_interp_v2(x, *dy_attr) + out = _legacy_C_ops.trilinear_interp_v2(x, *dy_attr) elif resample_type == "nearest": if in_dygraph_mode(): - out = _C_ops.final_state_nearest_interp( + out = _C_ops.nearest_interp( x, inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, @@ -637,10 +618,10 @@ def _is_list_or_turple_(data): attrs['interp_method'], attrs['align_corners'], attrs['align_mode']) else: - out = _C_ops.nearest_interp_v2(x, *dy_attr) + out = _legacy_C_ops.nearest_interp_v2(x, *dy_attr) elif resample_type == "bicubic": if in_dygraph_mode(): - out = _C_ops.final_state_bicubic_interp( + out = _C_ops.bicubic_interp( x, inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, @@ -649,7 +630,7 @@ def _is_list_or_turple_(data): attrs['interp_method'], attrs['align_corners'], attrs['align_mode']) else: - out = _C_ops.bicubic_interp_v2(x, *dy_attr) + out = _legacy_C_ops.bicubic_interp_v2(x, *dy_attr) return out out = helper.create_variable_for_type_inference(dtype) helper.append_op(type='{}_interp_v2'.format(resample_type), @@ -668,7 +649,7 @@ def upsample(x, data_format='NCHW', name=None): """ - This op resizes a batch of images. + This API resizes a batch of images. The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape @@ -716,6 +697,7 @@ def upsample(x, Example: .. code-block:: text + For scale_factor: if align_corners = True && out_size > 1 : scale_factor = (in_size-1.0)/(out_size-1.0) @@ -801,23 +783,23 @@ def upsample(x, Parameters: x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. - size (list|tuple|Tensor|None): Output shape of image resize + size (list|tuple|Tensor|None, optional): Output shape of image resize layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1]. If a Tensor , its dimensions size should be a 1. - scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + scale_factor (float|Tensor|list|tuple|None, optional): The multiplier for the input height or width. At least one of :attr:`size` or :attr:`scale_factor` must be set. And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if it is either a list or a tuple or a Tensor. 
Default: None. - mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', + mode (str, optional): The resample method. It supports 'linear', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' - align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the + align_corners(bool, optional) : An optional bool, If True, the centers of the 4 corner pixels of the input and output tensors are aligned, preserving the values at the corner pixels. Default: False - align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above, + align_mode(int, optional) : An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above, it can be \'0\' for src_idx = scale_factor*(dst_indx+0.5)-0.5 , can be \'1\' for src_idx = scale_factor*dst_index. data_format (str, optional): Specify the data format of the input, and the data format of the output @@ -832,32 +814,19 @@ def upsample(x, A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). - Raises: - TypeError: size should be a list or tuple or Tensor. - ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', - 'trilinear', 'bicubic', or 'nearest' currently. - ValueError: 'linear' only support 3-D tensor. - ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. - ValueError: 'trilinear' only support 5-D tensor. - ValueError: One of size and scale_factor must not be None. - ValueError: size length should be 1 for input 3-D tensor. - ValueError: size length should be 2 for input 4-D tensor. - ValueError: size length should be 3 for input 5-D tensor. - ValueError: scale_factor should be greater than zero. - TypeError: align_corners should be a bool value - ValueError: align_mode can only be '0' or '1' - ValueError: data_format can only be 'NCW', 'NWC', 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'. + Examples: .. code-block:: python - import paddle - import numpy as np - import paddle.nn.functional as F + + import paddle + import paddle.nn as nn + + input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + upsample_out = paddle.nn.Upsample(size=[12,12]) - input_data = np.random.rand(2,3,6,10).astype("float32") - input = paddle.to_tensor(input_data) - output = F.upsample(x=input, size=[12,12]) - print(output.shape) - # [2L, 3L, 12L, 12L] + output = upsample_out(x=input_data) + print(output.shape) + # [2L, 3L, 12L, 12L] """ return interpolate(x, size, scale_factor, mode, align_corners, align_mode, @@ -884,23 +853,23 @@ def bilinear(x1, x2, weight, bias=None, name=None): Examples: .. 
code-block:: python - import paddle - import numpy - import paddle.nn.functional as F - - x1 = numpy.random.random((5, 5)).astype('float32') - x2 = numpy.random.random((5, 4)).astype('float32') - w = numpy.random.random((1000, 5, 4)).astype('float32') - b = numpy.random.random((1, 1000)).astype('float32') + import paddle + import paddle.nn.functional as F - result = F.bilinear(paddle.to_tensor(x1), paddle.to_tensor(x2), paddle.to_tensor(w), paddle.to_tensor(b)) # result shape [5, 1000] + x1 = paddle.randn((5, 5)).astype(paddle.float32) + x2 = paddle.randn((5, 4)).astype(paddle.float32) + w = paddle.randn((1000, 5, 4)).astype(paddle.float32) + b = paddle.randn((1, 1000)).astype(paddle.float32) + result = F.bilinear(x1, x2, w, b) + print(result.shape) + # [5, 1000] """ if in_dygraph_mode(): - return _C_ops.final_state_bilinear_tensor_product(x1, x2, weight, bias) - elif _non_static_mode(): return _C_ops.bilinear_tensor_product(x1, x2, weight, bias) + elif _non_static_mode(): + return _legacy_C_ops.bilinear_tensor_product(x1, x2, weight, bias) check_variable_and_dtype(x1, 'x1', ['float32', 'float64'], 'bilinear') check_variable_and_dtype(x2, 'x2', ['float32', 'float64'], 'bilinear') @@ -933,10 +902,10 @@ def dropout(x, Args: x (Tensor): The input tensor. The data type is float32 or float64. - p (float|int): Probability of setting units to zero. Default 0.5. - axis (int|list|tuple): The axis along which the dropout is performed. Default None. - training (bool): A flag indicating whether it is in train phrase or not. Default True. - mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']. + p (float|int, optional): Probability of setting units to zero. Default 0.5. + axis (int|list|tuple, optional): The axis along which the dropout is performed. Default None. + training (bool, optional): A flag indicating whether it is in train phase or not. Default True. + mode(str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']. 1. upscale_in_train(default), upscale the output at training time @@ -1036,22 +1005,38 @@ def dropout(x, .. code-block:: python - import paddle - import numpy as np - - x = np.array([[1,2,3], [4,5,6]]).astype('float32') - x = paddle.to_tensor(x) - y_train = paddle.nn.functional.dropout(x, 0.5) - y_test = paddle.nn.functional.dropout(x, 0.5, training=False) - y_0 = paddle.nn.functional.dropout(x, axis=0) - y_1 = paddle.nn.functional.dropout(x, axis=1) - y_01 = paddle.nn.functional.dropout(x, axis=[0,1]) - print(x) - print(y_train) - print(y_test) - print(y_0) - print(y_1) - print(y_01) + import paddle + + x = paddle.to_tensor([[1,2,3], [4,5,6]]).astype(paddle.float32) + y_train = paddle.nn.functional.dropout(x, 0.5) + y_test = paddle.nn.functional.dropout(x, 0.5, training=False) + y_0 = paddle.nn.functional.dropout(x, axis=0) + y_1 = paddle.nn.functional.dropout(x, axis=1) + y_01 = paddle.nn.functional.dropout(x, axis=[0,1]) + print(x) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[1., 2., 3.], + # [4., 5., 6.]]) + print(y_train) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[2. , 0. , 6. ], + # [8. , 0. , 12.]]) + print(y_test) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[1., 2., 3.], + # [4., 5., 6.]]) + print(y_0) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0. , 0. , 0. ], + # [8. , 10., 12.]]) + print(y_1) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[2. , 0. , 6. 
], + # [8. , 0. , 12.]]) + print(y_01) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0. , 0. , 0. ], + # [8. , 0. , 12.]]) """ if not isinstance(p, (float, int, Variable)): @@ -1078,15 +1063,15 @@ def dropout(x, seed = default_main_program().random_seed if in_dygraph_mode(): - out, mask = _C_ops.final_state_dropout( x, None, p, not training, mode, \ + out, mask = _C_ops.dropout( x, None, p, not training, mode, \ seed if seed is not None else 0, seed is not None) return out - out, mask = _C_ops.dropout(x, 'dropout_prob', p, 'is_test', - not training, 'fix_seed', seed - is not None, 'seed', - seed if seed is not None else 0, - 'dropout_implementation', mode) + out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test', + not training, 'fix_seed', seed + is not None, 'seed', + seed if seed is not None else 0, + 'dropout_implementation', mode) return out helper = LayerHelper('dropout', **locals()) @@ -1199,17 +1184,16 @@ def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None): .. code-block:: python import paddle - import numpy as np - x = np.random.random(size=(2, 3, 4, 5)).astype('float32') - x = paddle.to_tensor(x) + x = paddle.randn(shape=(2, 3, 4, 5)).astype(paddle.float32) y_train = paddle.nn.functional.dropout2d(x) #train y_test = paddle.nn.functional.dropout2d(x, training=False) #test for i in range(2): for j in range(3): - print(x.numpy()[i,j,:,:]) - print(y_train.numpy()[i,j,:,:]) # may all 0 - print(y_test.numpy()[i,j,:,:]) + print(x[i,j,:,:]) + print(y_train[i,j,:,:]) # may all 0 + print(y_test[i,j,:,:]) + """ input_shape = x.shape if len(input_shape) != 4: @@ -1252,16 +1236,15 @@ def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + + x = paddle.randn(shape=(2, 3, 4, 5, 6)).astype(paddle.float32) + y_train = paddle.nn.functional.dropout3d(x) #train + y_test = paddle.nn.functional.dropout3d(x, training=False) #test + print(x[0,0,:,:,:]) + print(y_train[0,0,:,:,:]) # may all 0 + print(y_test[0,0,:,:,:]) - x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32') - x = paddle.to_tensor(x) - y_train = paddle.nn.functional.dropout3d(x) #train - y_test = paddle.nn.functional.dropout3d(x, training=False) #test - print(x.numpy()[0,0,:,:,:]) - print(y_train.numpy()[0,0,:,:,:]) # may all 0 - print(y_test.numpy()[0,0,:,:,:]) """ input_shape = x.shape @@ -1301,17 +1284,19 @@ def alpha_dropout(x, p=0.5, training=True, name=None): Examples: .. 
code-block:: python - import paddle - import numpy as np - - x = np.array([[-1, 1], [-1, 1]]).astype('float32') - x = paddle.to_tensor(x) - y_train = paddle.nn.functional.alpha_dropout(x, 0.5) - y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False) - print(x) - print(y_train) - # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly) - print(y_test) + import paddle + + x = paddle.to_tensor([[-1, 1], [-1, 1]]).astype(paddle.float32) + y_train = paddle.nn.functional.alpha_dropout(x, 0.5) + y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False) + print(y_train) + # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[-0.10721093, -0.77919382], + # [-0.10721093, 1.66559887]]) (randomly) + print(y_test) + # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[-1., 1.], + # [-1., 1.]]) """ if not isinstance(p, (float, int)): raise TypeError("p argument should be a float or int") @@ -1484,7 +1469,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): pad_value = value if in_dygraph_mode(): - out = _C_ops.final_state_pad(x, paddings, float(pad_value)) + out = _C_ops.pad(x, paddings, float(pad_value)) return out check_variable_and_dtype(x, 'x', [ @@ -1566,13 +1551,14 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): if in_dygraph_mode(): if isinstance(pad, Variable): pad = pad.numpy().tolist() - out = _C_ops.final_state_pad3d(x, pad, mode, value, data_format) + out = _C_ops.pad3d(x, pad, mode, value, data_format) else: if _in_legacy_dygraph(): if isinstance(pad, Variable): pad = pad.numpy().tolist() - out = _C_ops.pad3d(x, "paddings", pad, "mode", mode, "value", value, - "data_format", data_format, "name", name) + out = _legacy_C_ops.pad3d(x, "paddings", pad, "mode", mode, "value", + value, "data_format", data_format, "name", + name) else: attrs = {'mode': mode, 'value': value, 'data_format': data_format} inputs = {'X': [x]} @@ -1743,16 +1729,16 @@ def linear(x, weight, bias=None, name=None): """ if in_dygraph_mode(): #TODO(jiabin): using addmm for fast forward route - return _C_ops.final_state_linear(x, weight, bias) + return _C_ops.linear(x, weight, bias) else: if _in_legacy_dygraph(): - pre_bias = _C_ops.matmul_v2(x, weight, 'trans_x', False, 'trans_y', - False) + pre_bias = _legacy_C_ops.matmul_v2(x, weight, 'trans_x', False, + 'trans_y', False) if bias is None: return pre_bias - return _C_ops.elementwise_add(pre_bias, bias) + return _legacy_C_ops.elementwise_add(pre_bias, bias) else: helper = LayerHelper('linear', **locals()) dtype = x.dtype @@ -1845,11 +1831,11 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): raise ValueError("The value of epsilon must be between 0 and 1.") if in_dygraph_mode(): - return _C_ops.final_state_label_smooth(label, prior_dist, - float(epsilon)) + return _C_ops.label_smooth(label, prior_dist, float(epsilon)) elif paddle.in_dynamic_mode(): - return _C_ops.label_smooth(label, prior_dist, 'epsilon', float(epsilon)) + return _legacy_C_ops.label_smooth(label, prior_dist, 'epsilon', + float(epsilon)) check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'label_smooth') @@ -2013,11 +1999,12 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ seed = default_main_program().random_seed if in_dygraph_mode(): - return _C_ops.final_state_class_center_sample( - label, num_classes, num_samples, ring_id, rank, nranks, seed - is not None, seed if seed is not None else 0) + return 
_C_ops.class_center_sample(label, num_classes, num_samples, + ring_id, rank, nranks, seed + is not None, + seed if seed is not None else 0) elif paddle.in_dynamic_mode(): - remapped_label, sampled_class_center = _C_ops.class_center_sample( + remapped_label, sampled_class_center = _legacy_C_ops.class_center_sample( label, 'num_classes', num_classes, 'num_samples', num_samples, 'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', seed is not None, 'seed', seed if seed is not None else 0) @@ -2058,7 +2045,7 @@ def fold(x, name=None): r""" - This Op is used to combines an array of sliding local blocks into a large containing + Combines an array of sliding local blocks into a large containing tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each combined value in the resulting large tensor by summing all values from all containing blocks. @@ -2067,9 +2054,10 @@ def fold(x, can be calculated as following. .. math:: - H_out &= output_size[0] - W_out &= output_size[1] - C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1] + + H_{out} &= output\_size[0] \\ + W_{out} &= output\_size[1] \\ + C_{out} &= \frac{C_{in}}{kernel\_sizes[0]\times kernel\_sizes[1]} \\ Parameters: x(Tensor): 3-D Tensor, input tensor of format [N, C, L], @@ -2078,17 +2066,17 @@ def fold(x, or an interger o treated as [o, o]. kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list|tuple): The strides, should be [stride_h, stride_w] + strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list|tuple): The paddings of each dimension, should be + paddings(int|list|tuple, optional): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list|tuple): the dilations of convolution kernel, should be + dilations(int|list|tuple, optional): the dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. 
@@ -2164,12 +2152,13 @@ def _is_list_or_turple_(data): "of 2 or 4 integers") if in_dygraph_mode(): - out = _C_ops.final_state_fold(x, output_sizes, kernel_sizes, strides, - paddings, dilations) + out = _C_ops.fold(x, output_sizes, kernel_sizes, strides, paddings, + dilations) elif in_dynamic_mode(): - out = _C_ops.fold(x, "output_sizes", output_sizes, "kernel_sizes", - kernel_sizes, "strides", strides, "paddings", - paddings, "dilations", dilations) + out = _legacy_C_ops.fold(x, "output_sizes", output_sizes, + "kernel_sizes", kernel_sizes, "strides", + strides, "paddings", paddings, "dilations", + dilations) else: out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type="fold", diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index ef80c13f07f8d..5cdd8732f6c3a 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -24,7 +24,7 @@ from ...tensor.manipulation import unsqueeze, squeeze from ...tensor.math import add from ...fluid.layers import nn -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import get_flags from paddle import in_dynamic_mode from paddle.device import is_compiled_with_cuda @@ -123,10 +123,9 @@ def _conv_nd(x, # Due to the poor performance of NHWC, we transpose the input to NCHW. if in_dygraph_mode() and op_type == "conv2d": - pre_bias = _C_ops.final_state_conv2d(x, weight, stride, padding, - padding_algorithm, groups, - dilation, data_format, False, -1, - False) + pre_bias = _C_ops.conv2d(x, weight, stride, padding, padding_algorithm, + groups, dilation, data_format, False, -1, + False) if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim @@ -135,41 +134,41 @@ def _conv_nd(x, if isinstance(bias, tuple): bias = bias[0] if len(bias.shape) < len(x.shape): - tmp_bias = _C_ops.final_state_reshape( + tmp_bias = _C_ops.reshape( bias, bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)]) - return _C_ops.final_state_add(pre_bias, tmp_bias) + return _C_ops.add(pre_bias, tmp_bias) else: - return _C_ops.final_state_add(pre_bias, bias) + return _C_ops.add(pre_bias, bias) else: return pre_bias if in_dygraph_mode() and op_type == "depthwise_conv2d": - pre_bias = _C_ops.final_state_depthwise_conv2d( - x, weight, stride, padding, padding_algorithm, groups, dilation, - data_format, False, -1, False, False, use_cudnn) + pre_bias = _C_ops.depthwise_conv2d(x, weight, stride, padding, + padding_algorithm, groups, dilation, + data_format, False, -1, False, False, + use_cudnn) if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim - tmp_bias = _C_ops.final_state_reshape( + tmp_bias = _C_ops.reshape( bias, bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)]) - return _C_ops.final_state_add(pre_bias, tmp_bias) + return _C_ops.add(pre_bias, tmp_bias) else: return pre_bias if in_dygraph_mode() and op_type == "conv3d": - pre_bias = _C_ops.final_state_conv3d(x, weight, stride, padding, - padding_algorithm, groups, - dilation, data_format, False, -1, - False) + pre_bias = _C_ops.conv3d(x, weight, stride, padding, padding_algorithm, + groups, dilation, data_format, False, -1, + False) if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim - tmp_bias = _C_ops.final_state_reshape( + tmp_bias = _C_ops.reshape( bias, bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)]) - return 
_C_ops.final_state_add(pre_bias, tmp_bias) + return _C_ops.add(pre_bias, tmp_bias) else: return pre_bias @@ -179,7 +178,7 @@ def _conv_nd(x, use_mkldnn, 'fuse_relu_before_depthwise_conv', False, "padding_algorithm", padding_algorithm, "data_format", data_format) - pre_bias = getattr(_C_ops, op_type)(x, weight, *attrs) + pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: @@ -422,7 +421,6 @@ def conv1d(x, x = unsqueeze(x, axis=[squeeze_aixs]) if in_dygraph_mode(): - l_type = "final_state_" + l_type out = getattr(_C_ops, l_type)(x, weight, stride, padding, padding_algorithm, groups, dilation, conv2d_data_format, False, -1, @@ -434,7 +432,7 @@ def conv1d(x, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False, 'fuse_relu_before_depthwise_conv', False, "padding_algorithm", padding_algorithm, "data_format", conv2d_data_format) - out = getattr(_C_ops, l_type)(x, weight, *attrs) + out = getattr(_legacy_C_ops, l_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) else: @@ -645,10 +643,9 @@ def conv2d(x, use_cudnn = False else: if in_dygraph_mode(): - pre_bias = _C_ops.final_state_conv2d(x, weight, stride, padding, - padding_algorithm, groups, - dilation, data_format, False, - -1, False) + pre_bias = _C_ops.conv2d(x, weight, stride, padding, + padding_algorithm, groups, dilation, + data_format, False, -1, False) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) return out @@ -901,7 +898,6 @@ def conv1d_transpose(x, weight = unsqueeze(weight, axis=[-1]) if in_dygraph_mode(): - op_type = "final_state_" + op_type out = getattr(_C_ops, op_type)(x, weight, stride, padding, output_padding, output_size, padding_algorithm, groups, dilation, @@ -913,7 +909,7 @@ def conv1d_transpose(x, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'data_format', conv2d_data_format) - out = getattr(_C_ops, op_type)(x, weight, *attrs) + out = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) else: @@ -1153,10 +1149,9 @@ def conv2d_transpose(x, use_cudnn = False if in_dygraph_mode(): - final_state_op = _C_ops.final_state_conv2d_transpose if op_type == 'conv2d_transpose' else _C_ops.final_state_depthwise_conv2d_transpose - pre_bias = final_state_op(x, weight, stride, padding, output_padding, - output_size, padding_algorithm, groups, - dilation, data_format) + op = _C_ops.conv2d_transpose if op_type == 'conv2d_transpose' else _C_ops.depthwise_conv2d_transpose + pre_bias = op(x, weight, stride, padding, output_padding, output_size, + padding_algorithm, groups, dilation, data_format) if bias is not None: return nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: @@ -1167,7 +1162,7 @@ def conv2d_transpose(x, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'data_format', data_format) - pre_bias = getattr(_C_ops, op_type)(x, weight, *attrs) + pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: @@ -1572,9 +1567,10 @@ def conv3d_transpose(x, data_format_ = "NHWC" if channel_last else "NCHW" if in_dygraph_mode(): - pre_bias = _C_ops.final_state_conv3d_transpose( - x, weight, stride, 
padding, output_padding, output_size, - padding_algorithm, groups, dilation, data_format_) + pre_bias = _C_ops.conv3d_transpose(x, weight, stride, padding, + output_padding, output_size, + padding_algorithm, groups, dilation, + data_format_) if bias is not None: return nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: @@ -1585,7 +1581,7 @@ 'paddings', padding, "padding_algorithm", padding_algorithm, 'strides', stride, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, "data_format", data_format_) - pre_bias = getattr(_C_ops, op_type)(x, weight, *attrs) + pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py index 4d6f447d6737f..1c29d50974101 100644 --- a/python/paddle/nn/functional/distance.py +++ b/python/paddle/nn/functional/distance.py @@ -15,7 +15,7 @@ import paddle from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph __all__ = [] @@ -67,22 +67,22 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): check_type(epsilon, 'epsilon', (float), 'PairwiseDistance') check_type(keepdim, 'keepdim', (bool), 'PairwiseDistance') if in_dygraph_mode(): - sub = _C_ops.final_state_subtract(x, y) + sub = _C_ops.subtract(x, y) # p_norm op has not used epsilon, so change it to the following. if epsilon != 0.0: epsilon = paddle.fluid.dygraph.base.to_variable([epsilon], dtype=sub.dtype) - sub = _C_ops.final_state_add(sub, epsilon) - return _C_ops.final_state_p_norm(sub, p, -1, 0., keepdim, False) + sub = _C_ops.add(sub, epsilon) + return _C_ops.p_norm(sub, p, -1, 0., keepdim, False) if _in_legacy_dygraph(): - sub = _C_ops.elementwise_sub(x, y) + sub = _legacy_C_ops.elementwise_sub(x, y) if epsilon != 0.0: epsilon = paddle.fluid.dygraph.base.to_variable([epsilon], dtype=sub.dtype) - sub = _C_ops.elementwise_add(sub, epsilon) - return _C_ops.p_norm(sub, 'axis', -1, 'porder', p, 'keepdim', keepdim, 'epsilon', 0.) + sub = _legacy_C_ops.elementwise_add(sub, epsilon) + return _legacy_C_ops.p_norm(sub, 'axis', -1, 'porder', p, 'keepdim', + keepdim, 'epsilon', 0.)
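Both dygraph branches of pairwise_distance above compute the same value; only the op spelling changes. A NumPy sketch of that math, following the eager branch's order of operations (subtract, add epsilon, p-norm over the last axis); the function name is illustrative, not code from this patch:

import numpy as np

def pairwise_distance_ref(x, y, p=2.0, epsilon=1e-6, keepdim=False):
    # subtract -> add(epsilon) -> p_norm(axis=-1), mirroring both branches;
    # epsilon keeps the norm well-behaved when x == y.
    sub = x - y + epsilon
    out = np.sum(np.abs(sub) ** p, axis=-1) ** (1.0 / p)
    return np.expand_dims(out, -1) if keepdim else out

x = np.ones((4, 3))
y = np.zeros((4, 3))
print(pairwise_distance_ref(x, y))  # approx. [1.7321, 1.7321, 1.7321, 1.7321]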
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'PairwiseDistance') check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'PairwiseDistance') diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 995ba19058842..7ae35666c8612 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -22,7 +22,7 @@ from ...fluid import dygraph_utils from ...tensor.layer_function_generator import templatedoc from paddle import in_dynamic_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ...fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...framework import core @@ -102,10 +102,10 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): input = assign(input) if in_dygraph_mode(): - return _C_ops.final_state_diag_embed(input, offset, dim1, dim2) + return _C_ops.diag_embed(input, offset, dim1, dim2) elif in_dynamic_mode(): - return _C_ops.diag_embed(input, "offset", offset, "dim1", dim1, "dim2", - dim2) + return _legacy_C_ops.diag_embed(input, "offset", offset, "dim1", dim1, + "dim2", dim2) inputs = {'Input': [input]} attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2} @@ -217,10 +217,10 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): if maxlen is not None: if isinstance(maxlen, core.eager.Tensor): attrs = ('out_dtype', dtype) - out = _C_ops.sequence_mask(x, maxlen, *attrs) + out = _legacy_C_ops.sequence_mask(x, maxlen, *attrs) else: attrs = ('out_dtype', dtype, 'maxlen', maxlen) - out = _C_ops.sequence_mask(x, None, *attrs) + out = _legacy_C_ops.sequence_mask(x, None, *attrs) out.stop_gradient = True return out @@ -306,10 +306,10 @@ def gather_tree(ids, parents): """ if in_dygraph_mode(): - return _C_ops.final_state_gather_tree(ids, parents) + return _C_ops.gather_tree(ids, parents) else: if _in_legacy_dygraph(): - return _C_ops.gather_tree(ids, parents) + return _legacy_C_ops.gather_tree(ids, parents) else: helper = LayerHelper('gather_tree', **locals()) check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], @@ -366,11 +366,11 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. 
" "Received Attr(data_format): {}.".format(data_format)) if in_dygraph_mode(): - return _C_ops.final_state_temporal_shift(x, seg_num, shift_ratio, - data_format) + return _C_ops.temporal_shift(x, seg_num, shift_ratio, data_format) if _non_static_mode(): - return _C_ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', - shift_ratio, 'data_format', data_format) + return _legacy_C_ops.temporal_shift(x, 'seg_num', seg_num, + 'shift_ratio', shift_ratio, + 'data_format', data_format) helper = LayerHelper("temporal_shift", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 01a5f991f420e..0ed7d314b0836 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -17,7 +17,7 @@ from ...static import Variable from ...fluid.layer_helper import LayerHelper from ...fluid.data_feeder import check_variable_and_dtype, check_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode @@ -88,11 +88,11 @@ def one_hot(x, num_classes, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_one_hot(x, num_classes) + return _C_ops.one_hot(x, num_classes) else: if _in_legacy_dygraph(): - return _C_ops.one_hot_v2(x, 'depth', num_classes, - 'allow_out_of_range', False) + return _legacy_C_ops.one_hot_v2(x, 'depth', num_classes, + 'allow_out_of_range', False) else: check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'one_hot_v2') @@ -118,17 +118,17 @@ def one_hot(x, num_classes, name=None): def embedding(x, weight, padding_idx=None, sparse=False, name=None): r""" - The operator is used to lookup embeddings vector of ids provided by :attr:`x` . + Used to lookup embeddings vector of ids provided by :attr:`x` . The shape of output Tensor is generated by appending the last dimension of the input Tensor shape with embedding size. - **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < weight.shape[0]` , - otherwise the program will throw an exception and exit. + Note: + The id in :attr:`x` must satisfy :math:`0 =< id < weight.shape[0]` , + otherwise the program will throw an exception and exit. .. code-block:: text - - Case 1: + x is a Tensor. padding_idx = -1 x.data = [[1, 3], [2, 4], [4, 127]] @@ -151,17 +151,17 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): satisfy :math:`0<= id < weight.shape[0]` . weight (Tensor): The weight. A Tensor with shape of lookup table parameter. It should have two elements which indicates the size of the dictionary of embeddings and the size of each embedding vector respectively. - sparse(bool): The flag indicating whether to use sparse update. This parameter only + sparse(bool, optional): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster. But some optimizers does not support sparse update, such as :ref:`api_paddle_optimizer_adadelta_Adadelta` , :ref:`api_paddle_optimizer_adamax_Adamax` , :ref:`api_paddle_optimizer_lamb_Lamb`. In these cases, sparse must be False. Default: False. - padding_idx(int|long|None): padding_idx needs to be in the interval [-weight.shape[0], weight.shape[0]). + padding_idx(int|long|None, optional): padding_idx needs to be in the interval [-weight.shape[0], weight.shape[0]). 
If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`weight.shape[0] + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to output. Default: None. - name(str|None): For detailed information, please refer + name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -171,13 +171,12 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): Examples: .. code-block:: python - - import numpy as np + import paddle import paddle.nn as nn - x0 = np.arange(3, 6).reshape((3, 1)).astype(np.int64) - w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32) + x0 = paddle.arange(3, 6).reshape((3, 1)).astype(paddle.int64) + w0 = paddle.full(shape=(10, 3), fill_value=2).astype(paddle.float32) # x.data = [[3], [4], [5]] # x.shape = [3, 1] @@ -201,12 +200,12 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): weight.shape[0], weight.shape[0])) if in_dygraph_mode(): - return _C_ops.final_state_embedding(x, weight, padding_idx, sparse) + return _C_ops.embedding(x, weight, padding_idx, sparse) elif _in_legacy_dygraph(): - return _C_ops.lookup_table_v2(weight, x, 'is_sparse', sparse, - 'is_distributed', False, - 'remote_prefetch', False, 'padding_idx', - padding_idx) + return _legacy_C_ops.lookup_table_v2(weight, x, 'is_sparse', sparse, + 'is_distributed', False, + 'remote_prefetch', False, + 'padding_idx', padding_idx) else: helper = LayerHelper('embedding', **locals()) dtype = helper.input_dtype(input_param_name='weight') diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 119eb1bfbeed4..7fb4ccb233b2c 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -26,7 +26,7 @@ from ...fluid.framework import _varbase_creator from ...static import Variable from paddle.utils import deprecated -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode from paddle.framework import core, _non_static_mode from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode, _current_expected_place @@ -139,7 +139,7 @@ def log_loss(input, label, epsilon=1e-4, name=None): cost = F.log_loss(input=prob, label=label) """ if in_dygraph_mode(): - return _C_ops.final_state_log_loss(input, label, epsilon) + return _C_ops.log_loss(input, label, epsilon) helper = LayerHelper('log_loss', **locals()) check_variable_and_dtype(input, 'input', ['float32'], 'log_loss') @@ -260,17 +260,17 @@ def fluid_softmax_with_cross_entropy(logits, """ if _non_static_mode(): if core.is_compiled_with_npu(): - softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( + softmax, backprop, loss = _legacy_C_ops.softmax_with_cross_entropy( logits, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', numeric_stable_mode, 'axis', axis) else: if in_dygraph_mode(): - softmax, loss = _C_ops.final_state_cross_entropy_with_softmax( + softmax, loss = _C_ops.cross_entropy_with_softmax( logits, label, soft_label, True, numeric_stable_mode, ignore_index, axis) if _in_legacy_dygraph(): - softmax, loss = _C_ops.softmax_with_cross_entropy( + softmax, loss = _legacy_C_ops.softmax_with_cross_entropy( logits, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', 
numeric_stable_mode, 'axis', axis) @@ -412,12 +412,12 @@ def square_error_cost(input, label): """ if in_dygraph_mode(): - minus_out = _C_ops.final_state_subtract(input, label) - square_out = _C_ops.final_state_square(minus_out) + minus_out = _C_ops.subtract(input, label) + square_out = _C_ops.square(minus_out) return square_out elif _in_legacy_dygraph(): - minus_out = _C_ops.elementwise_sub(input, label) - square_out = _C_ops.square(minus_out) + minus_out = _legacy_C_ops.elementwise_sub(input, label) + square_out = _legacy_C_ops.square(minus_out) return square_out check_variable_and_dtype(input, "input", ['float32', 'float64'], @@ -535,8 +535,8 @@ def edit_distance(input, label = erased_label if in_dygraph_mode(): - return _C_ops.final_state_edit_distance(input, label, input_length, - label_length, normalized) + return _C_ops.edit_distance(input, label, input_length, label_length, + normalized) this_inputs = {"Hyps": [input], "Refs": [label]} if input_length is not None and label_length is not None: @@ -633,27 +633,27 @@ def binary_cross_entropy(input, reduction) if in_dygraph_mode(): - out = _C_ops.final_state_bce_loss(input, label) + out = _C_ops.bce_loss(input, label) if weight is not None: - out = _C_ops.final_state_multiply(out, weight, 'axis', -1) + out = _C_ops.multiply(out, weight, 'axis', -1) if reduction == 'sum': - return _C_ops.final_state_sum(out, [], None, False) + return _C_ops.sum(out, [], None, False) elif reduction == 'mean': - return _C_ops.final_state_mean_all(out) + return _C_ops.mean_all(out) else: return out else: if _in_legacy_dygraph(): - out = _C_ops.bce_loss(input, label) + out = _legacy_C_ops.bce_loss(input, label) if weight is not None: - out = _C_ops.elementwise_mul(out, weight, 'axis', -1) + out = _legacy_C_ops.elementwise_mul(out, weight, 'axis', -1) if reduction == 'sum': - return _C_ops.reduce_sum(out, 'dim', [0], 'keep_dim', False, - "reduce_all", True) + return _legacy_C_ops.reduce_sum(out, 'dim', [0], 'keep_dim', + False, "reduce_all", True) elif reduction == 'mean': - return _C_ops.mean(out) + return _legacy_C_ops.mean(out) else: return out else: @@ -778,44 +778,41 @@ def binary_cross_entropy_with_logits(logit, % reduction) if in_dygraph_mode(): - one = _C_ops.final_state_full([1], float(1.0), - core.VarDesc.VarType.FP32, - _current_expected_place()) - out = _C_ops.final_state_sigmoid_cross_entropy_with_logits( - logit, label, False, -100) + one = _C_ops.full([1], float(1.0), core.VarDesc.VarType.FP32, + _current_expected_place()) + out = _C_ops.sigmoid_cross_entropy_with_logits(logit, label, False, + -100) if pos_weight is not None: - log_weight = _C_ops.final_state_add( - _C_ops.final_state_multiply( - label, _C_ops.final_state_subtract(pos_weight, one)), one) - out = _C_ops.final_state_multiply(out, log_weight) + log_weight = _C_ops.add( + _C_ops.multiply(label, _C_ops.subtract(pos_weight, one)), one) + out = _C_ops.multiply(out, log_weight) if weight is not None: - out = _C_ops.final_state_multiply(out, weight) + out = _C_ops.multiply(out, weight) if reduction == "sum": - return _C_ops.final_state_sum(out, [], None, False) + return _C_ops.sum(out, [], None, False) elif reduction == "mean": - return _C_ops.final_state_mean_all(out) + return _C_ops.mean_all(out) else: return out elif _in_legacy_dygraph(): one = _varbase_creator(dtype=logit.dtype) - _C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', False, - 'dtype', one.dtype, 'str_value', '1.0', 'shape', - [1]) - out = _C_ops.sigmoid_cross_entropy_with_logits(logit, label) + 
_legacy_C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', + False, 'dtype', one.dtype, 'str_value', + '1.0', 'shape', [1]) + out = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label) if pos_weight is not None: - log_weight = _C_ops.elementwise_add( - _C_ops.elementwise_mul(label, - _C_ops.elementwise_sub(pos_weight, one)), - one) - out = _C_ops.elementwise_mul(out, log_weight) + log_weight = _legacy_C_ops.elementwise_add( + _legacy_C_ops.elementwise_mul( + label, _legacy_C_ops.elementwise_sub(pos_weight, one)), one) + out = _legacy_C_ops.elementwise_mul(out, log_weight) if weight is not None: - out = _C_ops.elementwise_mul(out, weight) + out = _legacy_C_ops.elementwise_mul(out, weight) if reduction == "sum": - return _C_ops.reduce_sum(out, 'reduce_all', True) + return _legacy_C_ops.reduce_sum(out, 'reduce_all', True) elif reduction == "mean": - return _C_ops.mean(out) + return _legacy_C_ops.mean(out) else: return out @@ -940,16 +937,15 @@ def hsigmoid_loss(input, # [1.92374969]] """ if in_dygraph_mode(): - out, _, _ = _C_ops.final_state_hierarchical_sigmoid( - input, weight, label, path_table, path_code, bias, num_classes, - is_sparse, 0, [], [], [], is_sparse) - return out - elif _in_legacy_dygraph(): out, _, _ = _C_ops.hierarchical_sigmoid(input, weight, label, path_table, path_code, bias, - 'num_classes', num_classes, - 'is_sparse', is_sparse, - 'remote_prefetch', is_sparse) + num_classes, is_sparse, 0, [], + [], [], is_sparse) + return out + elif _in_legacy_dygraph(): + out, _, _ = _legacy_C_ops.hierarchical_sigmoid( + input, weight, label, path_table, path_code, bias, 'num_classes', + num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse) return out check_variable_and_dtype(input, 'input', ['float32', 'float64'], @@ -1056,7 +1052,7 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): 'smooth_l1_loss') if in_dygraph_mode(): - out, residual = _C_ops.final_state_huber_loss(input, label, delta) + out, residual = _C_ops.huber_loss(input, label, delta) else: helper = LayerHelper('huber_loss', **locals()) residual = helper.create_variable_for_type_inference( @@ -1139,28 +1135,28 @@ def margin_ranking_loss(input, "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but " "received %s, which is not allowed." 
% reduction) if in_dygraph_mode(): - out = _C_ops.final_state_subtract(other, input) - out = _C_ops.final_state_multiply(out, label) + out = _C_ops.subtract(other, input) + out = _C_ops.multiply(out, label) if margin != 0.0: margin = fluid.dygraph.base.to_variable([margin], dtype=out.dtype) - out = _C_ops.final_state_add(out, margin) - out = _C_ops.final_state_relu(out) + out = _C_ops.add(out, margin) + out = _C_ops.relu(out) if reduction == 'sum': - return _C_ops.final_state_sum(out, [], None, False) + return _C_ops.sum(out, [], None, False) elif reduction == 'mean': - return _C_ops.final_state_mean_all(out) + return _C_ops.mean_all(out) return out elif _in_legacy_dygraph(): - out = _C_ops.elementwise_sub(other, input) - out = _C_ops.elementwise_mul(out, label) + out = _legacy_C_ops.elementwise_sub(other, input) + out = _legacy_C_ops.elementwise_mul(out, label) if margin != 0.0: margin = fluid.dygraph.base.to_variable([margin], dtype=out.dtype) - out = _C_ops.elementwise_add(out, margin) - out = _C_ops.relu(out) + out = _legacy_C_ops.elementwise_add(out, margin) + out = _legacy_C_ops.relu(out) if reduction == 'sum': - return _C_ops.reduce_sum(out, 'reduce_all', True) + return _legacy_C_ops.reduce_sum(out, 'reduce_all', True) elif reduction == 'mean': - return _C_ops.mean(out) + return _legacy_C_ops.mean(out) return out helper = LayerHelper("margin_ranking_loss", **locals()) @@ -1272,9 +1268,9 @@ def l1_loss(input, label, reduction='mean', name=None): act='abs', op_name='elementwise_sub') if reduction == 'mean': - return _C_ops.final_state_mean_all(unreduced) + return _C_ops.mean_all(unreduced) elif reduction == 'sum': - return _C_ops.final_state_sum(unreduced, [], None, False) + return _C_ops.sum(unreduced, [], None, False) else: return unreduced elif _in_legacy_dygraph(): @@ -1284,10 +1280,10 @@ def l1_loss(input, label, reduction='mean', name=None): act='abs', op_name='elementwise_sub') if reduction == 'mean': - return _C_ops.mean(unreduced) + return _legacy_C_ops.mean(unreduced) elif reduction == 'sum': - return _C_ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim', False, - 'reduce_all', True) + return _legacy_C_ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim', + False, 'reduce_all', True) else: return unreduced @@ -1376,25 +1372,26 @@ def nll_loss(input, c = input_shape[1] if in_dygraph_mode(): if input_dims != 2 and input_dims != 4: - input = _C_ops.final_state_reshape(input, [n, c, 1, -1]) - label = _C_ops.final_state_reshape(label, [n, 1, -1]) + input = _C_ops.reshape(input, [n, c, 1, -1]) + label = _C_ops.reshape(label, [n, 1, -1]) out_shape = [n] + input_shape[2:] - out, total_weight = _C_ops.final_state_nll_loss(input, label, weight, - ignore_index, reduction) + out, total_weight = _C_ops.nll_loss(input, label, weight, ignore_index, + reduction) if input_dims != 2 and input_dims != 4 and reduction == 'none': - out = _C_ops.final_state_reshape(out, out_shape) + out = _C_ops.reshape(out, out_shape) return out elif _in_legacy_dygraph(): if input_dims != 2 and input_dims != 4: - input, _ = _C_ops.reshape2(input, None, 'shape', [n, c, 1, -1]) - label, _ = _C_ops.reshape2(label, None, 'shape', [n, 1, -1]) + input, _ = _legacy_C_ops.reshape2(input, None, 'shape', + [n, c, 1, -1]) + label, _ = _legacy_C_ops.reshape2(label, None, 'shape', [n, 1, -1]) out_shape = [n] + input_shape[2:] - out, total_weight = _C_ops.nll_loss(input, label, weight, - 'ignore_index', ignore_index, - 'reduction', reduction) + out, total_weight = _legacy_C_ops.nll_loss(input, label, weight, + 'ignore_index', 
ignore_index, + 'reduction', reduction) if input_dims != 2 and input_dims != 4 and reduction == 'none': - out, _ = _C_ops.reshape2(out, None, 'shape', out_shape) + out, _ = _legacy_C_ops.reshape2(out, None, 'shape', out_shape) return out helper = LayerHelper('nll_loss', **locals()) @@ -1512,7 +1509,7 @@ def kl_div(input, label, reduction='mean', name=None): label = paddle.cast(label, 'float64') if in_dygraph_mode(): - out = _C_ops.final_state_kldiv_loss(input, label, 'none') + out = _C_ops.kldiv_loss(input, label, 'none') if reduction == 'mean': out = paddle.mean(out) elif reduction == 'sum': @@ -1523,7 +1520,7 @@ def kl_div(input, label, reduction='mean', name=None): out = paddle.sum(out) / batch_size return out elif _in_legacy_dygraph(): - out = _C_ops.kldiv_loss(input, label, 'reduction', 'none') + out = _legacy_C_ops.kldiv_loss(input, label, 'reduction', 'none') if reduction == 'mean': out = paddle.mean(out) elif reduction == 'sum': @@ -1951,9 +1948,10 @@ def margin_cross_entropy(logits, label = paddle.unsqueeze(label, axis=-1) if in_dygraph_mode(): - softmax, loss = _C_ops.final_state_margin_cross_entropy( - logits, label, return_softmax, ring_id, rank, nranks, margin1, - margin2, margin3, scale) + softmax, loss = _C_ops.margin_cross_entropy(logits, label, + return_softmax, ring_id, + rank, nranks, margin1, + margin2, margin3, scale) if reduction == 'mean': loss = paddle.mean(loss) elif reduction == 'sum': @@ -1963,7 +1961,7 @@ def margin_cross_entropy(logits, else: return loss, softmax elif _in_legacy_dygraph(): - softmax, loss = _C_ops.margin_cross_entropy( + softmax, loss = _legacy_C_ops.margin_cross_entropy( logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', scale, 'return_softmax', return_softmax) @@ -2324,18 +2322,19 @@ def cross_entropy(input, label_max.item())) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): if soft_label == False: - _, _, out = _C_ops.softmax_with_cross_entropy( + _, _, out = _legacy_C_ops.softmax_with_cross_entropy( input, valid_label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, 'use_softmax', use_softmax) else: - _, _, out = _C_ops.softmax_with_cross_entropy( + _, _, out = _legacy_C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, 'use_softmax', use_softmax) else: - _, out = _C_ops.final_state_cross_entropy_with_softmax( - input, label, soft_label, use_softmax, True, ignore_index, axis) + _, out = _C_ops.cross_entropy_with_softmax(input, label, soft_label, + use_softmax, True, + ignore_index, axis) if weight is not None: @@ -2354,7 +2353,7 @@ def cross_entropy(input, weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) - out = _C_ops.final_state_multiply(out, weight_gather_reshape) + out = _C_ops.multiply(out, weight_gather_reshape) else: if input.shape[axis] != weight.shape[-1]: raise ValueError( @@ -2374,24 +2373,23 @@ def cross_entropy(input, temp_perm = list(range(axis % valid_label.ndim)) \ + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ + [axis % valid_label.ndim] - weight_gather = _C_ops.final_state_gather_nd( + weight_gather = _C_ops.gather_nd( weight, valid_label.transpose(temp_perm)) else: - weight_gather = _C_ops.final_state_gather_nd( - weight, valid_label) - weight_gather = _C_ops.final_state_multiply( - weight_gather, 
ignore_weight_mask) + weight_gather = _C_ops.gather_nd(weight, valid_label) + weight_gather = _C_ops.multiply(weight_gather, + ignore_weight_mask) input_shape = list(label.shape) weight_gather_reshape = reshape(weight_gather, shape=input_shape) out = paddle.cast(out, weight_gather_reshape.dtype) - out = _C_ops.final_state_multiply(out, weight_gather_reshape) + out = _C_ops.multiply(out, weight_gather_reshape) if reduction == "sum": # because of fluid_softmax_with_cross_entropy op's inner logic, # in the out tensor of this op, the loss of sample with class_index==ignore_index is 0 # so, reduce_sum all directly is ok - return _C_ops.final_state_sum(out, [], None, False) + return _C_ops.sum(out, [], None, False) elif reduction == "mean": # 1. if weight==none, # numerator: reduce_sum all loss directly is ok causeof fluid_softmax_with_cross_entropy's inner logic @@ -2400,30 +2398,29 @@ def cross_entropy(input, # numerator: loss's weighted sum # denominator: cal the sum of weight where the sample's class_index!=ignore_index if ignore_index >= 0: - out_sum = _C_ops.final_state_sum(out, [], None, False) + out_sum = _C_ops.sum(out, [], None, False) # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index # mask[i]=1, otherwise mask = (label != ignore_index) if weight is None: mask = paddle.cast(mask, dtype=out_sum.dtype) - count = _C_ops.final_state_sum(mask, [], None, False) + count = _C_ops.sum(mask, [], None, False) ret = out_sum / (count + (count == 0.0)) else: mask = paddle.cast(mask, weight_gather_reshape.dtype) - weight_ignored = _C_ops.final_state_multiply( - mask, weight_gather_reshape) - weight_sum = _C_ops.final_state_sum(weight_ignored, [], - None, False) + weight_ignored = _C_ops.multiply(mask, + weight_gather_reshape) + weight_sum = _C_ops.sum(weight_ignored, [], None, False) ret = out_sum / (weight_sum + (weight_sum == 0.0)) return ret elif weight is not None: - out_sum = _C_ops.final_state_sum(out, [], None, False) - total_weight = _C_ops.final_state_sum(weight_gather_reshape, [], - None, False) + out_sum = _C_ops.sum(out, [], None, False) + total_weight = _C_ops.sum(weight_gather_reshape, [], None, + False) return out_sum / (total_weight + (total_weight == 0.0)) else: - return _C_ops.final_state_mean_all(out) + return _C_ops.mean_all(out) else: if input_dims - 1 == label_dims: @@ -2444,17 +2441,17 @@ def cross_entropy(input, label_max.item())) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): if soft_label == False: - _, _, out = _C_ops.softmax_with_cross_entropy( + _, _, out = _legacy_C_ops.softmax_with_cross_entropy( input, valid_label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, 'use_softmax', use_softmax) else: - _, _, out = _C_ops.softmax_with_cross_entropy( + _, _, out = _legacy_C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, 'use_softmax', use_softmax) else: - _, out = _C_ops.softmax_with_cross_entropy( + _, out = _legacy_C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, 'use_softmax', use_softmax) @@ -2476,7 +2473,7 @@ def cross_entropy(input, weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) - out = _C_ops.elementwise_mul(out, weight_gather_reshape) + out = _legacy_C_ops.elementwise_mul(out, 
weight_gather_reshape) else: if input.shape[axis] != weight.shape[-1]: @@ -2497,23 +2494,23 @@ def cross_entropy(input, temp_perm = list(range(axis % valid_label.ndim)) \ + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ + [axis % valid_label.ndim] - weight_gather = _C_ops.gather_nd( + weight_gather = _legacy_C_ops.gather_nd( weight, valid_label.transpose(temp_perm)) else: - weight_gather = _C_ops.gather_nd(weight, valid_label) - weight_gather = _C_ops.elementwise_mul(weight_gather, - ignore_weight_mask) + weight_gather = _legacy_C_ops.gather_nd(weight, valid_label) + weight_gather = _legacy_C_ops.elementwise_mul( + weight_gather, ignore_weight_mask) input_shape = list(label.shape) weight_gather_reshape = reshape(weight_gather, shape=input_shape) out = paddle.cast(out, weight_gather_reshape.dtype) - out = _C_ops.elementwise_mul(out, weight_gather_reshape) + out = _legacy_C_ops.elementwise_mul(out, weight_gather_reshape) if reduction == "sum": # because of fluid_softmax_with_cross_entropy op's inner logic, # in the out tensor of this op, the loss of sample with class_index==ignore_index is 0 # so, reduce_sum all directly is ok - return _C_ops.reduce_sum(out, 'reduce_all', True) + return _legacy_C_ops.reduce_sum(out, 'reduce_all', True) elif reduction == "mean": # 1. if weight==none, # numerator: reduce_sum all loss directly is ok causeof fluid_softmax_with_cross_entropy's inner logic @@ -2522,30 +2519,30 @@ def cross_entropy(input, # numerator: loss's weighted sum # denominator: cal the sum of weight where the sample's class_index!=ignore_index if ignore_index >= 0: - out_sum = _C_ops.reduce_sum(out, 'reduce_all', True) + out_sum = _legacy_C_ops.reduce_sum(out, 'reduce_all', True) # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index # mask[i]=1, otherwise mask = (label != ignore_index) if weight is None: mask = paddle.cast(mask, dtype=out_sum.dtype) - count = _C_ops.reduce_sum(mask, 'reduce_all', True) + count = _legacy_C_ops.reduce_sum(mask, 'reduce_all', True) ret = out_sum / (count + (count == 0.0)) else: mask = paddle.cast(mask, weight_gather_reshape.dtype) - weight_ignored = _C_ops.elementwise_mul( + weight_ignored = _legacy_C_ops.elementwise_mul( mask, weight_gather_reshape) - weight_sum = _C_ops.reduce_sum(weight_ignored, 'reduce_all', - True) + weight_sum = _legacy_C_ops.reduce_sum( + weight_ignored, 'reduce_all', True) ret = out_sum / (weight_sum + (weight_sum == 0.0)) return ret elif weight is not None: - out_sum = _C_ops.reduce_sum(out, 'reduce_all', True) - total_weight = _C_ops.reduce_sum(weight_gather_reshape, - 'reduce_all', True) + out_sum = _legacy_C_ops.reduce_sum(out, 'reduce_all', True) + total_weight = _legacy_C_ops.reduce_sum(weight_gather_reshape, + 'reduce_all', True) return out_sum / (total_weight + (total_weight == 0.0)) else: - return _C_ops.mean(out) + return _legacy_C_ops.mean(out) else: if input_dims - 1 == label_dims: out = paddle.squeeze(out, axis=axis) @@ -2753,76 +2750,74 @@ def sigmoid_focal_loss(logit, if in_dygraph_mode(): place = _current_expected_place() - one = _C_ops.final_state_full(logit.shape, float(1.0), logit.dtype, - place) + one = _C_ops.full(logit.shape, float(1.0), logit.dtype, place) - loss = _C_ops.final_state_sigmoid_cross_entropy_with_logits( - logit, label, False, -100) + loss = _C_ops.sigmoid_cross_entropy_with_logits(logit, label, False, + -100) - pred = _C_ops.final_state_sigmoid(logit) + pred = _C_ops.sigmoid(logit) - p_t = _C_ops.final_state_add( - 
_C_ops.final_state_multiply(pred, label), - _C_ops.final_state_multiply(_C_ops.final_state_subtract(one, pred), - _C_ops.final_state_subtract(one, - label))) + p_t = _C_ops.add( + _C_ops.multiply(pred, label), + _C_ops.multiply(_C_ops.subtract(one, pred), + _C_ops.subtract(one, label))) alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype) - alpha_t = _C_ops.final_state_add( - _C_ops.final_state_multiply(alpha, label), - _C_ops.final_state_multiply(_C_ops.final_state_subtract(one, alpha), - _C_ops.final_state_subtract(one, - label))) - loss = _C_ops.final_state_multiply(alpha_t, loss) + alpha_t = _C_ops.add( + _C_ops.multiply(alpha, label), + _C_ops.multiply(_C_ops.subtract(one, alpha), + _C_ops.subtract(one, label))) + loss = _C_ops.multiply(alpha_t, loss) gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype) - gamma_t = _C_ops.final_state_pow(_C_ops.final_state_subtract(one, p_t), - gamma) - loss = _C_ops.final_state_multiply(gamma_t, loss) + gamma_t = _C_ops.pow(_C_ops.subtract(one, p_t), gamma) + loss = _C_ops.multiply(gamma_t, loss) if normalizer is not None: - loss = _C_ops.final_state_divide(loss, normalizer) + loss = _C_ops.divide(loss, normalizer) if reduction == "sum": - return _C_ops.final_state_sum(loss, [], None, False) + return _C_ops.sum(loss, [], None, False) elif reduction == "mean": - return _C_ops.final_state_mean_all(loss) + return _C_ops.mean_all(loss) return loss elif _in_legacy_dygraph(): one = _varbase_creator(dtype=logit.dtype) - _C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', False, - 'dtype', one.dtype, 'str_value', '1.0', 'shape', - logit.shape) - loss = _C_ops.sigmoid_cross_entropy_with_logits(logit, label) + _legacy_C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', + False, 'dtype', one.dtype, 'str_value', + '1.0', 'shape', logit.shape) + loss = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label) - pred = _C_ops.sigmoid(logit) + pred = _legacy_C_ops.sigmoid(logit) - p_t = _C_ops.elementwise_add( - _C_ops.elementwise_mul(pred, label), - _C_ops.elementwise_mul(_C_ops.elementwise_sub(one, pred), - _C_ops.elementwise_sub(one, label))) + p_t = _legacy_C_ops.elementwise_add( + _legacy_C_ops.elementwise_mul(pred, label), + _legacy_C_ops.elementwise_mul( + _legacy_C_ops.elementwise_sub(one, pred), + _legacy_C_ops.elementwise_sub(one, label))) alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype) - alpha_t = _C_ops.elementwise_add( - _C_ops.elementwise_mul(alpha, label), - _C_ops.elementwise_mul(_C_ops.elementwise_sub(one, alpha), - _C_ops.elementwise_sub(one, label))) - loss = _C_ops.elementwise_mul(alpha_t, loss) + alpha_t = _legacy_C_ops.elementwise_add( + _legacy_C_ops.elementwise_mul(alpha, label), + _legacy_C_ops.elementwise_mul( + _legacy_C_ops.elementwise_sub(one, alpha), + _legacy_C_ops.elementwise_sub(one, label))) + loss = _legacy_C_ops.elementwise_mul(alpha_t, loss) gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype) - gamma_t = _C_ops.elementwise_pow(_C_ops.elementwise_sub(one, p_t), - gamma) - loss = _C_ops.elementwise_mul(gamma_t, loss) + gamma_t = _legacy_C_ops.elementwise_pow( + _legacy_C_ops.elementwise_sub(one, p_t), gamma) + loss = _legacy_C_ops.elementwise_mul(gamma_t, loss) if normalizer is not None: - loss = _C_ops.elementwise_div(loss, normalizer) + loss = _legacy_C_ops.elementwise_div(loss, normalizer) if reduction == "sum": - return _C_ops.reduce_sum(loss, 'reduce_all', True) + return _legacy_C_ops.reduce_sum(loss, 'reduce_all', True) elif reduction == "mean": - 
return _C_ops.mean(loss) + return _legacy_C_ops.mean(loss) return loss diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index f9c7b5f8e6b17..03ba72fdda344 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -22,7 +22,7 @@ from ...framework import ParamAttr from ...fluid import dygraph_utils import numbers -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode from paddle.fluid.framework import core, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph @@ -81,14 +81,14 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): """ if in_dygraph_mode(): eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype) - out = _C_ops.final_state_p_norm(x, float(p), axis, epsilon, True, False) - return x / _C_ops.final_state_maximum(out, eps) + out = _C_ops.p_norm(x, float(p), axis, epsilon, True, False) + return x / _C_ops.maximum(out, eps) if _in_legacy_dygraph(): eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype) - out = _C_ops.p_norm(x, 'axis', axis, 'porder', float(p), 'keepdim', - True, 'epsilon', epsilon) - return x / _C_ops.elementwise_max(out, eps) + out = _legacy_C_ops.p_norm(x, 'axis', axis, 'porder', float(p), + 'keepdim', True, 'epsilon', epsilon) + return x / _legacy_C_ops.elementwise_max(out, eps) check_type(p, 'p', (float, int), 'normalize') check_type(axis, 'axis', (int), 'normalize') @@ -189,7 +189,7 @@ def batch_norm(x, trainable_statistics = not use_global_stats if in_dygraph_mode(): - batch_norm_out, _, _, _, _, _ = _C_ops.final_state_batch_norm( + batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( x, weight, bias, running_mean, running_var, momentum, epsilon, data_format, not training, use_global_stats, trainable_statistics, False) @@ -204,7 +204,7 @@ def batch_norm(x, "fuse_with_relu", False, "use_global_stats", use_global_stats, "trainable_statistics", trainable_statistics) - batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( + batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( x, weight, bias, running_mean, running_var, None, mean_out, variance_out, *attrs) @@ -323,14 +323,15 @@ def layer_norm(x, str(input_shape)) if in_dygraph_mode(): - pre_act, _, _, = _C_ops.final_state_layer_norm(x, weight, bias, epsilon, - begin_norm_axis, False) + pre_act, _, _, = _C_ops.layer_norm(x, weight, bias, epsilon, + begin_norm_axis, False) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) if _in_legacy_dygraph(): - pre_act, _, _ = _C_ops.layer_norm(x, weight, bias, 'epsilon', epsilon, - 'begin_norm_axis', begin_norm_axis) + pre_act, _, _ = _legacy_C_ops.layer_norm(x, weight, bias, 'epsilon', + epsilon, 'begin_norm_axis', + begin_norm_axis) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], @@ -413,12 +414,12 @@ def instance_norm(x, """ if in_dygraph_mode(): - out = _C_ops.final_state_instance_norm(x, weight, bias, eps) + out = _C_ops.instance_norm(x, weight, bias, eps) return out if _in_legacy_dygraph(): - out, _, _ = _C_ops.instance_norm(x, weight, bias, "epsilon", eps, - "momentum", momentum, "data_format", - data_format) + out, _, _ = _legacy_C_ops.instance_norm(x, weight, bias, "epsilon", eps, + "momentum", momentum, + "data_format", data_format) return out check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 
21e2aafe916f0..0aa0a8420831a 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -16,7 +16,7 @@ from ...fluid.layers import utils, LayerHelper from ...tensor.manipulation import unsqueeze, squeeze from ...fluid.data_feeder import check_type, check_variable_and_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode from paddle.fluid.framework import _in_legacy_dygraph from paddle.fluid.framework import in_dygraph_mode @@ -192,23 +192,16 @@ def avg_pool1d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length is greater than 1. - ShapeError: If the input is not a 3-D tensor. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. code-block:: python import paddle - import paddle.nn.functional as F - import numpy as np + import paddle.nn as nn - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0) - # out shape: [1, 3, 16] + data = paddle.uniform([1, 3, 32], paddle.float32) + AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) + pool_out = AvgPool1D(data) + # pool_out shape: [1, 3, 16] """ """NCL to NCHW""" data_format = "NCHW" @@ -237,13 +230,13 @@ def avg_pool1d(x, padding = _expand_low_nd_padding(padding) if in_dynamic_mode(): - output = _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', kernel_size, - 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, - 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', exclusive, 'data_format', - data_format) + output = _legacy_C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', + kernel_size, 'global_pooling', False, + 'strides', stride, 'paddings', padding, + 'padding_algorithm', padding_algorithm, + 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', + exclusive, 'data_format', data_format) return squeeze(output, [2]) op_type = 'pool2d' @@ -318,20 +311,14 @@ def avg_pool2d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. 
code-block:: python import paddle import paddle.nn.functional as F - import numpy as np # avg pool2d - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + x = paddle.uniform([1, 3, 32, 32], paddle.float32) out = F.avg_pool2d(x, kernel_size=2, stride=2, padding=0) @@ -354,18 +341,16 @@ def avg_pool2d(x, if in_dygraph_mode() or _in_legacy_dygraph(): if in_dygraph_mode(): - output = _C_ops.final_state_pool2d(x, kernel_size, stride, padding, - ceil_mode, exclusive, - data_format, 'avg', False, False, - padding_algorithm) + output = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode, + exclusive, data_format, 'avg', False, False, + padding_algorithm) else: - output = _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', - kernel_size, 'global_pooling', False, - 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, - 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', exclusive, - 'data_format', data_format) + output = _legacy_C_ops.pool2d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', + exclusive, 'data_format', data_format) if divisor_override is None: return output else: @@ -446,19 +431,13 @@ def avg_pool3d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - + Examples: .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + x = paddle.uniform([1, 3, 32, 32, 32], paddle.float32) # avg pool3d out = paddle.nn.functional.avg_pool3d( x, @@ -484,18 +463,16 @@ def avg_pool3d(x, if in_dygraph_mode() or _in_legacy_dygraph(): if in_dygraph_mode(): - output = _C_ops.final_state_pool3d(x, kernel_size, stride, padding, - ceil_mode, exclusive, - data_format, 'avg', False, False, - padding_algorithm) + output = _C_ops.pool3d(x, kernel_size, stride, padding, ceil_mode, + exclusive, data_format, 'avg', False, False, + padding_algorithm) if _in_legacy_dygraph(): - output = _C_ops.pool3d(x, 'pooling_type', 'avg', 'ksize', - kernel_size, 'strides', stride, 'paddings', - padding, 'global_pooling', False, - 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', exclusive, - 'data_format', data_format) + output = _legacy_C_ops.pool3d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', + stride, 'paddings', padding, 'global_pooling', False, + 'padding_algorithm', padding_algorithm, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', + exclusive, 'data_format', data_format) if divisor_override is None: return output else: @@ -581,9 +558,8 @@ def max_pool1d(x, import paddle import paddle.nn.functional as F - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + data = paddle.uniform([1, 3, 32], paddle.float32) pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) # pool_out shape: [1, 3, 16] pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) @@ -610,21 +586,20 @@ def max_pool1d(x, if 
in_dygraph_mode(): if return_mask: - pool_out = _C_ops.final_state_max_pool2d_with_index( - x, kernel_size, stride, padding, False, False) + pool_out = _C_ops.max_pool2d_with_index(x, kernel_size, stride, + padding, False, False) return (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2])) if return_mask else squeeze( pool_out[0], [2]) else: - pool_out = _C_ops.final_state_pool2d(x, kernel_size, stride, - padding, ceil_mode, True, - data_format, 'max', False, - False, padding_algorithm) + pool_out = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode, + True, data_format, 'max', False, False, + padding_algorithm) return squeeze(pool_out, [2]) if _in_legacy_dygraph(): if return_mask: - pool_out = _C_ops.max_pool2d_with_index( + pool_out = _legacy_C_ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, @@ -634,13 +609,12 @@ def max_pool1d(x, squeeze(pool_out[1], [2])) if return_mask else squeeze( pool_out[0], [2]) else: - pool_out = _C_ops.pool2d(x, 'pooling_type', 'max', 'ksize', - kernel_size, 'global_pooling', False, - 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, - 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, - 'data_format', data_format) + pool_out = _legacy_C_ops.pool2d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) return squeeze(pool_out, [2]) op_type = 'max_pool2d_with_index' if return_mask else "pool2d" @@ -783,14 +757,14 @@ def max_unpool1d(x, output_size) if in_dygraph_mode(): - output = _C_ops.final_state_unpool(x, indices, kernel_size, stride, - padding, output_size, data_format) + output = _C_ops.unpool(x, indices, kernel_size, stride, padding, + output_size, data_format) return squeeze(output, [2]) elif in_dynamic_mode(): - output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', - kernel_size, 'strides', stride, 'paddings', - padding, "output_size", output_size, - "data_format", data_format) + output = _legacy_C_ops.unpool(x, indices, 'unpooling_type', 'max', + 'ksize', kernel_size, 'strides', stride, + 'paddings', padding, "output_size", + output_size, "data_format", data_format) return squeeze(output, [2]) op_type = "unpool" @@ -902,14 +876,14 @@ def max_unpool2d(x, output_size) if in_dygraph_mode(): - output = _C_ops.final_state_unpool(x, indices, kernel_size, stride, - padding, output_size, data_format) + output = _C_ops.unpool(x, indices, kernel_size, stride, padding, + output_size, data_format) elif in_dynamic_mode(): - output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', - kernel_size, 'strides', stride, 'paddings', - padding, "output_size", output_size, - "data_format", data_format) + output = _legacy_C_ops.unpool(x, indices, 'unpooling_type', 'max', + 'ksize', kernel_size, 'strides', stride, + 'paddings', padding, "output_size", + output_size, "data_format", data_format) return output op_type = "unpool" @@ -1019,13 +993,13 @@ def max_unpool3d(x, output_size) if in_dygraph_mode(): - output = _C_ops.final_state_unpool3d(x, indices, kernel_size, stride, - padding, output_size, data_format) + output = _C_ops.unpool3d(x, indices, kernel_size, stride, padding, + output_size, data_format) elif 
in_dynamic_mode(): - output = _C_ops.unpool3d(x, indices, 'unpooling_type', 'max', 'ksize', - kernel_size, 'strides', stride, 'paddings', - padding, "output_size", output_size, - "data_format", data_format) + output = _legacy_C_ops.unpool3d(x, indices, 'unpooling_type', 'max', + 'ksize', kernel_size, 'strides', stride, + 'paddings', padding, "output_size", + output_size, "data_format", data_format) return output op_type = "unpool3d" @@ -1140,18 +1114,17 @@ def max_pool2d(x, if in_dygraph_mode(): if return_mask: - output = _C_ops.final_state_max_pool2d_with_index( - x, kernel_size, stride, padding, False, False) + output = _C_ops.max_pool2d_with_index(x, kernel_size, stride, + padding, False, False) return output if return_mask else output[0] else: - return _C_ops.final_state_pool2d(x, kernel_size, stride, padding, - ceil_mode, True, data_format, - 'max', False, False, - padding_algorithm) + return _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode, + True, data_format, 'max', False, False, + padding_algorithm) if _in_legacy_dygraph(): if return_mask: - output = _C_ops.max_pool2d_with_index( + output = _legacy_C_ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, @@ -1159,13 +1132,12 @@ def max_pool2d(x, data_format) return output if return_mask else output[0] else: - output = _C_ops.pool2d(x, 'pooling_type', 'max', 'ksize', - kernel_size, 'global_pooling', False, - 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, - 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, - 'data_format', data_format) + output = _legacy_C_ops.pool2d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) return output op_type = 'max_pool2d_with_index' if return_mask else "pool2d" @@ -1283,18 +1255,17 @@ def max_pool3d(x, if in_dygraph_mode(): if return_mask: - output = _C_ops.final_state_max_pool3d_with_index( - x, kernel_size, stride, padding, False, False) + output = _C_ops.max_pool3d_with_index(x, kernel_size, stride, + padding, False, False) return output if return_mask else output[0] else: - return _C_ops.final_state_pool3d(x, kernel_size, stride, padding, - ceil_mode, True, data_format, - 'max', False, False, - padding_algorithm) + return _C_ops.pool3d(x, kernel_size, stride, padding, ceil_mode, + True, data_format, 'max', False, False, + padding_algorithm) if _in_legacy_dygraph(): if return_mask: - output = _C_ops.max_pool3d_with_index( + output = _legacy_C_ops.max_pool3d_with_index( x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, 'global_pooling', False, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, @@ -1302,13 +1273,12 @@ def max_pool3d(x, 'data_format', data_format) return output if return_mask else output[0] else: - output = _C_ops.pool3d(x, 'pooling_type', 'max', 'ksize', - kernel_size, 'global_pooling', False, - 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, - 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, - 'data_format', data_format) + output = _legacy_C_ops.pool3d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 
'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) return output op_type = "max_pool3d_with_index" if return_mask else "pool3d" @@ -1350,8 +1320,10 @@ def adaptive_avg_pool1d(x, output_size, name=None): x (Tensor): The input Tensor of pooling, which is a 3-D tensor with shape :math:`[N, C, L]`, where :math:`N` is batch size, :math:`C` is the number of channels and :math:`L` is the length of the feature. The data type is float32 or float64. output_size (int): The target output size. Its data type must be int. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + Returns: Tensor: The result of 1D adaptive average pooling. Its data type is same as input. + Examples: .. code-block:: python @@ -1384,8 +1356,8 @@ def adaptive_avg_pool1d(x, output_size, name=None): x = unsqueeze(x, [2]) if in_dynamic_mode(): - pool_out = _C_ops.pool2d(x, 'pooling_type', pool_type, 'ksize', - pool_size, 'adaptive', True) + pool_out = _legacy_C_ops.pool2d(x, 'pooling_type', pool_type, 'ksize', + pool_size, 'adaptive', True) return squeeze(pool_out, [2]) l_type = "pool2d" @@ -1409,8 +1381,16 @@ def adaptive_avg_pool1d(x, output_size, name=None): def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): """ - This API implements adaptive average pooling 2d operation. - See more details in :ref:`api_nn_pooling_AdaptiveAvgPool2d` . + Applies 2D adaptive avg pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. + + For avg adaptive pool2d: + .. math:: + hstart &= floor(i * H_{in} / H_{out}) + hend &= ceil((i + 1) * H_{in} / H_{out}) + wstart &= floor(j * W_{in} / W_{out}) + wend &= ceil((j + 1) * W_{in} / W_{out}) + Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)} Args: x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. @@ -1426,8 +1406,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): None by default. Returns: Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. - Raises: - ValueError: If `data_format` is not "NCHW" or "NHWC". + Examples: .. code-block:: python @@ -1482,15 +1461,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size[1] = in_w if in_dygraph_mode(): - return _C_ops.final_state_pool2d_gpudnn_unused(x, output_size, [1, 1], - [0, 0], False, True, - data_format, 'avg', - False, True, "EXPLICIT") + return _C_ops.pool2d_gpudnn_unused(x, output_size, [1, 1], [0, 0], + False, True, data_format, 'avg', + False, True, "EXPLICIT") if _in_legacy_dygraph(): - return _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', output_size, - 'global_pooling', False, 'adaptive', True, - 'data_format', data_format) + return _legacy_C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', + output_size, 'global_pooling', False, + 'adaptive', True, 'data_format', + data_format) l_type = 'pool2d' @@ -1515,8 +1494,19 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): """ - This API implements adaptive average pooling 3d operation. - See more details in :ref:`api_nn_pooling_AdaptiveAvgPool3d` . + This operation applies 3D adaptive avg pooling on input tensor. 
The h and w dimensions + of the output tensor are determined by the parameter output_size. + + For avg adaptive pool3d: + .. math:: + dstart &= floor(i * D_{in} / D_{out}) + dend &= ceil((i + 1) * D_{in} / D_{out}) + hstart &= floor(j * H_{in} / H_{out}) + hend &= ceil((j + 1) * H_{in} / H_{out}) + wstart &= floor(k * W_{in} / W_{out}) + wend &= ceil((k + 1) * W_{in} / W_{out}) + Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} + {(dend - dstart) * (hend - hstart) * (wend - wstart)} Args: x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. @@ -1532,8 +1522,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): None by default. Returns: Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. - Raises: - ValueError: If `data_format` is not "NCDHW" or "NDHWC". + Examples: .. code-block:: python @@ -1556,12 +1545,10 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): # output[:, :, i, j, k] = # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) import paddle - import numpy as np - input_data = np.random.rand(2, 3, 8, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 8, 32, 32] + + input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) out = paddle.nn.functional.adaptive_avg_pool3d( - x = x, + x = input_data, output_size=[3, 3, 3]) # out.shape is [2, 3, 3, 3, 3] """ @@ -1592,9 +1579,10 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): output_size[2] = in_w if in_dynamic_mode(): - return _C_ops.pool3d(x, 'pooling_type', 'avg', 'ksize', output_size, - 'global_pooling', False, 'adaptive', True, - 'data_format', data_format) + return _legacy_C_ops.pool3d(x, 'pooling_type', 'avg', 'ksize', + output_size, 'global_pooling', False, + 'adaptive', True, 'data_format', + data_format) l_type = 'pool3d' @@ -1654,9 +1642,8 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): # import paddle import paddle.nn.functional as F - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + data = paddle.uniform([1, 3, 32], paddle.float32) pool_out = F.adaptive_max_pool1d(data, output_size=16) # pool_out shape: [1, 3, 16]) pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_mask=True) @@ -1674,14 +1661,15 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): x = unsqueeze(x, [2]) if in_dygraph_mode(): - pool_out = _C_ops.final_state_max_pool2d_with_index( - x, pool_size, [1, 1], [0, 0], False, True) + pool_out = _C_ops.max_pool2d_with_index(x, pool_size, [1, 1], [0, 0], + False, True) return (squeeze(pool_out[0], [2]), squeeze( pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2]) if _in_legacy_dygraph(): - pool_out = _C_ops.max_pool2d_with_index(x, 'pooling_type', pool_type, - 'ksize', pool_size, 'adaptive', - True) + pool_out = _legacy_C_ops.max_pool2d_with_index(x, 'pooling_type', + pool_type, 'ksize', + pool_size, 'adaptive', + True) return (squeeze(pool_out[0], [2]), squeeze( pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2]) @@ -1740,13 +1728,10 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) # import paddle - import numpy as np - input_data = np.random.rand(2, 3, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 32, 32] + input_data = paddle.randn(shape=(2, 3, 32, 32)) out = 
paddle.nn.functional.adaptive_max_pool2d( - x = x, + x = input_data, output_size=[3, 3]) # out.shape is [2, 3, 3, 3] """ @@ -1767,13 +1752,13 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): if output_size[1] == None: output_size[1] = in_w if in_dygraph_mode(): - pool_out = _C_ops.final_state_max_pool2d_with_index( - x, output_size, [1, 1], [0, 0], False, True) + pool_out = _C_ops.max_pool2d_with_index(x, output_size, [1, 1], [0, 0], + False, True) return pool_out if return_mask else pool_out[0] if _in_legacy_dygraph(): - pool_out = _C_ops.max_pool2d_with_index(x, 'pooling_type', 'max', - 'ksize', output_size, - 'adaptive', True) + pool_out = _legacy_C_ops.max_pool2d_with_index(x, 'pooling_type', 'max', + 'ksize', output_size, + 'adaptive', True) return pool_out if return_mask else pool_out[0] l_type = 'max_pool2d_with_index' @@ -1833,13 +1818,10 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): # output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend]) # import paddle - import numpy as np - input_data = np.random.rand(2, 3, 8, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 8, 32, 32] + input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) out = paddle.nn.functional.adaptive_max_pool3d( - x = x, + x = input_data, output_size=[3, 3, 3]) # out.shape is [2, 3, 3, 3, 3] """ @@ -1864,9 +1846,9 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): output_size[2] = in_w if in_dynamic_mode(): - pool_out = _C_ops.max_pool3d_with_index(x, 'pooling_type', 'max', - 'ksize', output_size, - 'adaptive', True) + pool_out = _legacy_C_ops.max_pool3d_with_index(x, 'pooling_type', 'max', + 'ksize', output_size, + 'adaptive', True) return pool_out if return_mask else pool_out[0] l_type = 'max_pool3d_with_index' diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py index 53be014527815..77327bae5204f 100644 --- a/python/paddle/nn/functional/sparse_attention.py +++ b/python/paddle/nn/functional/sparse_attention.py @@ -16,7 +16,7 @@ import paddle from ...fluid.framework import default_main_program from paddle.fluid.layer_helper import LayerHelper -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode @@ -144,7 +144,7 @@ def sparse_attention(query, # [1.99830270, 2.99830270]]]] """ if in_dynamic_mode(): - result_attention, result_sdd, result_softmax = _C_ops.sparse_attention( + result_attention, result_sdd, result_softmax = _legacy_C_ops.sparse_attention( query, key, value, sparse_csr_offset, sparse_csr_columns, key_padding_mask, attn_mask) return result_attention diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 0b562e515ea6c..c1b1d5fca4bf5 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -18,7 +18,7 @@ from ...fluid.data_feeder import check_variable_and_dtype from ...fluid import dygraph_utils import numpy as np -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ...device import is_compiled_with_rocm from paddle import in_dynamic_mode from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph @@ -89,14 +89,13 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): if in_dygraph_mode(): _out_shape = out_shape.numpy().tolist() if isinstance( out_shape, Variable) else out_shape - return _C_ops.final_state_affine_grid(theta, _out_shape, use_cudnn, - align_corners) 
+ return _C_ops.affine_grid(theta, _out_shape, use_cudnn, align_corners) elif in_dynamic_mode(): _out_shape = out_shape.numpy().tolist() if isinstance( out_shape, Variable) else out_shape - return _C_ops.affine_grid(theta, "output_shape", _out_shape, - "align_corners", align_corners, "use_cudnn", - use_cudnn) + return _legacy_C_ops.affine_grid(theta, "output_shape", _out_shape, + "align_corners", align_corners, + "use_cudnn", use_cudnn) helper = LayerHelper('affine_grid') check_variable_and_dtype(theta, 'theta', ['float32', 'float64'], @@ -277,12 +276,11 @@ def grid_sample(x, grid.stop_gradient = False if in_dygraph_mode(): - return _C_ops.final_state_grid_sample(x, grid, mode, padding_mode, - align_corners) + return _C_ops.grid_sample(x, grid, mode, padding_mode, align_corners) elif in_dynamic_mode(): attrs = ('mode', mode, 'padding_mode', padding_mode, 'align_corners', align_corners, 'use_cudnn', use_cudnn) - out = getattr(_C_ops, 'grid_sampler')(x, grid, *attrs) + out = getattr(_legacy_C_ops, 'grid_sampler')(x, grid, *attrs) else: helper = LayerHelper("grid_sample", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sample') @@ -336,11 +334,11 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None): "Attr(data_format) should be 'NCHW' or 'NHWC'." "But received Attr(data_format): {} ".format(data_format)) if in_dygraph_mode(): - return _C_ops.final_state_pixel_shuffle(x, upscale_factor, data_format) + return _C_ops.pixel_shuffle(x, upscale_factor, data_format) if _in_legacy_dygraph(): - return _C_ops.pixel_shuffle(x, "upscale_factor", upscale_factor, - "data_format", data_format) + return _legacy_C_ops.pixel_shuffle(x, "upscale_factor", upscale_factor, + "data_format", data_format) helper = LayerHelper("pixel_shuffle", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_shuffle') @@ -395,8 +393,9 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): "But received Attr(data_format): {} ".format(data_format)) if _non_static_mode(): - return _C_ops.pixel_unshuffle(x, "downscale_factor", downscale_factor, - "data_format", data_format) + return _legacy_C_ops.pixel_unshuffle(x, "downscale_factor", + downscale_factor, "data_format", + data_format) helper = LayerHelper("pixel_unshuffle", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_unshuffle') @@ -463,8 +462,8 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None): "But received Attr(data_format): {} ".format(data_format)) if _non_static_mode(): - return _C_ops.channel_shuffle(x, "groups", groups, "data_format", - data_format) + return _legacy_C_ops.channel_shuffle(x, "groups", groups, "data_format", + data_format) helper = LayerHelper("channel_shuffle", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'channel_shuffle') diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index c9f125cd98641..b0ad44b41ef7b 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -19,7 +19,7 @@ from ...fluid.framework import _current_expected_place from paddle import in_dynamic_mode from paddle.utils import unique_name -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ... 
import fluid __all__ = [] @@ -132,8 +132,8 @@ def __call__(self, var, block=None): if framework.in_dygraph_mode(): with fluid.dygraph.no_grad(): place = _current_expected_place() - _C_ops.final_state_full_(out_var, out_var.shape, str(float(0)), - out_var.dtype, place) + _C_ops.full_(out_var, out_var.shape, str(float(0)), + out_var.dtype, place) else: block.append_op(type='fill_constant', @@ -171,7 +171,7 @@ def __call__(self, var, block=None): idx_list.append(offset) if framework.in_dygraph_mode(): with fluid.dygraph.no_grad(): - tmp_out = _C_ops.final_state_reshape(out_var, [-1]) + tmp_out = _C_ops.reshape(out_var, [-1]) tmp_out._share_underline_tensor_to(out_var) else: x_shape = block.create_var(name=unique_name.generate(".".join( @@ -198,9 +198,9 @@ def __call__(self, var, block=None): if framework.in_dygraph_mode(): with fluid.dygraph.no_grad(): tmp_tensor = framework._varbase_creator() - _C_ops.assign_value(tmp_tensor, 'shape', [len(idx_list)], - 'dtype', VarDesc.VarType.INT64, - 'int64_values', idx_list) + _legacy_C_ops.assign_value(tmp_tensor, 'shape', [len(idx_list)], + 'dtype', VarDesc.VarType.INT64, + 'int64_values', idx_list) tmp_tensor._share_underline_tensor_to(index_tensor) else: block.append_op(type='assign_value', @@ -220,9 +220,10 @@ def __call__(self, var, block=None): if framework.in_dygraph_mode(): with fluid.dygraph.no_grad(): tmp_tensor = framework._varbase_creator() - _C_ops.assign_value(tmp_tensor, 'shape', [len(value_list)], - 'dtype', VarDesc.VarType.FP32, - 'fp32_values', value_list) + _legacy_C_ops.assign_value(tmp_tensor, 'shape', + [len(value_list)], 'dtype', + VarDesc.VarType.FP32, 'fp32_values', + value_list) tmp_tensor._share_underline_tensor_to(value_tensor) else: block.append_op(type='assign_value', @@ -236,14 +237,13 @@ def __call__(self, var, block=None): if framework.in_dygraph_mode(): with fluid.dygraph.no_grad(): - tmp_out = _C_ops.final_state_scatter(out_var, index_tensor, - value_tensor, True) + tmp_out = _C_ops.scatter(out_var, index_tensor, value_tensor, + True) tmp_out._share_underline_tensor_to(out_var) - tmp_reshape_out = _C_ops.final_state_reshape( - out_var, origin_shape) + tmp_reshape_out = _C_ops.reshape(out_var, origin_shape) tmp_reshape_out._share_underline_tensor_to(out_var) if var.dtype != VarDesc.VarType.FP32: - tmp_cast_out = _C_ops.final_state_cast(out_var, var.dtype) + tmp_cast_out = _C_ops.cast(out_var, var.dtype) tmp_cast_out._share_underline_tensor_to(var) else: op = block.append_op(type="scatter", diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py index 33634357e7133..63e0152e22b4a 100644 --- a/python/paddle/nn/initializer/orthogonal.py +++ b/python/paddle/nn/initializer/orthogonal.py @@ -18,7 +18,7 @@ from ...tensor import diag, transpose, sign, qr, reshape from paddle.utils import unique_name from ...fluid.dygraph import no_grad -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -106,22 +106,23 @@ def __call__(self, var, block=None): if framework.in_dygraph_mode(): with no_grad(): place = framework._current_expected_place() - normal_var = _C_ops.final_state_gaussian_random( - flatten_shape, 0.0, 1.0, self._seed, var.dtype, place) - q, r = _C_ops.final_state_qr(normal_var, 'reduced') + normal_var = _C_ops.gaussian_random(flatten_shape, 0.0, 1.0, + self._seed, var.dtype, + place) + q, r = _C_ops.qr(normal_var, 'reduced') - r_diag = _C_ops.final_state_diag(r, 0, 0) + r_diag = _C_ops.diag(r, 0, 0) - r_sign = _C_ops.final_state_sign(r_diag) + 
r_sign = _C_ops.sign(r_diag) - q = _C_ops.final_state_multiply(q, r_sign) + q = _C_ops.multiply(q, r_sign) if row < col: - q = _C_ops.final_state_transpose(q, [1, 0]) + q = _C_ops.transpose(q, [1, 0]) - q = _C_ops.final_state_reshape(q, var.shape) + q = _C_ops.reshape(q, var.shape) - tmp = _C_ops.final_state_scale(q, self._gain, 0.0, True) + tmp = _C_ops.scale(q, self._gain, 0.0, True) tmp._share_underline_tensor_to(var) diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index c03864a19d58c..68cc0cedb8f27 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -140,11 +140,10 @@ class GELU(Layer): Examples: .. code-block:: python - + import paddle - import numpy as np - x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) + x = paddle.to_tensor([[-1, 0.5],[1, 1.5]]) m = paddle.nn.GELU() out = m(x) # [-0.158655 0.345731 0.841345 1.39979] diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 05230c19c489b..ee5641f5d1257 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1279,9 +1279,8 @@ def extra_repr(self): class Embedding(Layer): r""" - **Embedding Layer** - - This interface is used to construct a callable object of the ``Embedding`` class. + + Embedding Layer, used to construct a callable object of the ``Embedding`` class. For specific usage, refer to code examples. It implements the function of the Embedding Layer. This layer is used to lookup embeddings vector of ids provided by :attr:`x` . It automatically constructs a 2D embedding matrix based on the @@ -1290,8 +1289,9 @@ class Embedding(Layer): The shape of output Tensor is generated by appending an emb_size dimension to the last dimension of the input Tensor shape. - **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < num_embeddings` , - otherwise the program will throw an exception and exit. + Note: + The id in :attr:`x` must satisfy :math:`0 <= id < num_embeddings` , + otherwise the program will throw an exception and exit. .. code-block:: text @@ -1318,23 +1318,23 @@ class Embedding(Layer): num_embeddings (int): Just one element which indicates the size of the dictionary of embeddings. embedding_dim (int): Just one element which indicates the size of each embedding vector respectively. - padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings). + padding_idx(int|long|None, optional): padding_idx needs to be in the interval [-num_embeddings, num_embeddings). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to output. Default: None. - sparse(bool): The flag indicating whether to use sparse update. This parameter only + sparse(bool, optional): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster. But some optimizers do not support sparse update, such as :ref:`api_paddle_optimizer_adadelta_Adadelta` , :ref:`api_paddle_optimizer_adamax_Adamax` , :ref:`api_paddle_optimizer_lamb_Lamb`. In these cases, sparse must be False. Default: False. - weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the + weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example for details. - name(str|None): For detailed information, please refer + name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -1514,7 +1514,7 @@ def extra_repr(self): class Fold(Layer): r""" - This Op is used to combines an array of sliding local blocks into a large containing + Combines an array of sliding local blocks into a large containing tensor, also known as col2im when operated on a batched 2D image tensor. Fold calculates each combined value in the resulting large tensor by summing all values from all containing blocks. @@ -1523,26 +1523,27 @@ class Fold(Layer): can be calculated as follows. .. math:: - H_out &= output_size[0] - W_out &= output_size[1] - C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1] + + H_{out} &= output\_size[0] \\ + W_{out} &= output\_size[1] \\ + C_{out} &= \frac{C_{in}}{kernel\_sizes[0]\times kernel\_sizes[1]} \\ Parameters: output_sizes(list): The size of output size, should be [output_size_h, output_size_w] or an integer o treated as [o, o]. kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list|tuple): The strides, should be [stride_h, stride_w] + strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] or an integer stride treated as [stride, stride]. For default, strides will be [1, 1]. - paddings(int|list|tuple): The paddings of each dimension, should be + paddings(int|list|tuple, optional): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will be expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list|tuple): the dilations of convolution kernel, should be + dilations(int|list|tuple, optional): the dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. 
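The rewritten Fold formula above pins down the shape bookkeeping: the folded result keeps the :math:`H_{out} \times W_{out}` spatial extent given by output_sizes, while the channel count shrinks by the kernel area. A minimal usage sketch of the documented class (the concrete shapes are illustrative, not taken from the patch):

.. code-block:: python

    import paddle

    # With kernel_sizes=[2, 2], C_out = C_in / (2 * 2); output_sizes=[4, 4]
    # and strides=2 give 2 * 2 = 4 non-overlapping sliding positions, so the
    # input must be [N, C_in, L] = [1, 12, 4] to fold into C_out = 12 / 4 = 3.
    x = paddle.randn([1, 12, 4])
    fold = paddle.nn.Fold(output_sizes=[4, 4], kernel_sizes=[2, 2], strides=2)
    y = fold(x)
    print(y.shape)  # [1, 3, 4, 4], i.e. [N, C_out, H_out, W_out]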
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 6612c0e75ce16..1ff37afa1412e 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -602,8 +602,7 @@ def forward(self, input, label): 'MSELoss') if in_dygraph_mode(): - square_out = paddle._C_ops.final_state_square( - paddle.subtract(input, label)) + square_out = paddle._C_ops.square(paddle.subtract(input, label)) else: square_out = paddle.square(paddle.subtract(input, label)) if self.reduction == 'none': diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index ac65aab07a56c..5515554077647 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -46,7 +46,7 @@ import warnings from ...framework import no_grad from .. import functional as F -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from .. import Layer from paddle import in_dynamic_mode from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph @@ -412,15 +412,14 @@ def forward(self, input): dtype=input.dtype, stop_gradient=True) if in_dygraph_mode(): - pre_act = _C_ops.final_state_group_norm(input, self.weight, - self.bias, self._epsilon, - self._num_groups, "NCHW") + pre_act = _C_ops.group_norm(input, self.weight, self.bias, + self._epsilon, self._num_groups, "NCHW") return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) elif _in_legacy_dygraph(): - pre_act, _, _ = _C_ops.group_norm( + pre_act, _, _ = _legacy_C_ops.group_norm( input, self.weight, self.bias, @@ -1110,7 +1109,7 @@ def forward(self, x): ### train mode: use mini-batch stats, eval mode: use global stats ### use_global_stats only support False in sync_batch_norm if in_dygraph_mode(): - sync_batch_norm_out, _, _, _, _, _ = _C_ops.final_state_sync_batch_norm_( + sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm_( x, self.weight, self.bias, self._mean, self._variance, self._momentum, self._epsilon, self._data_format, not self.training, False, False, False) @@ -1122,7 +1121,7 @@ def forward(self, x): self._data_format, "use_mkldnn", False, "fuse_with_relu", False, "use_global_stats", False, 'trainable_statistics', False) - sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm( + sync_batch_norm_out, _, _, _, _, _ = _legacy_C_ops.sync_batch_norm( x, self.weight, self.bias, self._mean, self._variance, mean_out, variance_out, *attrs) return sync_batch_norm_out diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 46ae56a463ea3..ccba13316a17b 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -53,22 +53,15 @@ class AvgPool1D(Layer): name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: - A callable object of AvgPool1D. - - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D tensor. - ShapeError: If the output's shape calculated is not greater than 0. - Shape: - x(Tensor): The input tensor of avg pool1d operator, which is a 3-D tensor. The data type can be float32, float64. - output(Tensor): The output tensor of avg pool1d operator, which is a 3-D tensor. The data type is same as input x. + Returns: + A callable object of AvgPool1D. + Examples: .. 
code-block:: python @@ -164,10 +157,7 @@ class AvgPool2D(Layer): Returns: A callable object of AvgPool2D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. + Examples: .. code-block:: python @@ -255,10 +245,6 @@ class AvgPool3D(Layer): Returns: A callable object of AvgPool3D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. Shape: - x(Tensor): The input tensor of avg pool3d operator, which is a 5-D tensor. diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 53dfad4106de0..72b7e374c517c 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -32,7 +32,7 @@ from paddle.fluid.layers import utils from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as from paddle.fluid.data_feeder import convert_dtype -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode from paddle.framework import core from paddle.static import default_startup_program @@ -984,10 +984,12 @@ def flatten_parameters(self): dtype=core.VarDesc.VarType.UINT8) if in_dynamic_mode(): with paddle.no_grad(): - _C_ops.coalesce_tensor(self._all_weights, self._all_weights, - self._flat_weight[0], "copy_data", - True, "use_align", False, "dtype", - params[0].dtype) + _legacy_C_ops.coalesce_tensor(self._all_weights, + self._all_weights, + self._flat_weight[0], + "copy_data", True, + "use_align", False, "dtype", + params[0].dtype) return # for static-graph, append coalesce_tensor into startup program with program_guard(default_startup_program(), @@ -1010,7 +1012,7 @@ def _cudnn_impl(self, inputs, initial_states, sequence_length): inputs = paddle.tensor.transpose(inputs, [1, 0, 2]) if in_dynamic_mode(): - _, _, out, state = _C_ops.rnn( + _, _, out, state = _legacy_C_ops.rnn( inputs, initial_states, self._all_weights, sequence_length, self._dropout_state, self.state_components, 'dropout_prob', self.dropout, 'is_bidirec', self.num_directions == 2, diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index b2fc03b1b9003..59d1389f09974 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -23,7 +23,7 @@ from paddle.nn import functional as F import logging from paddle.fluid.log_helper import get_logger -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode from paddle.nn import Layer @@ -103,7 +103,7 @@ def forward(self, input): dtype=self._dtype, persistable=False) out_scale.stop_gradient = True - out, _, = _C_ops.fake_quantize_dequantize_abs_max( + out, _, = _legacy_C_ops.fake_quantize_dequantize_abs_max( input, quant_out, out_scale, *attrs) return out @@ -201,7 +201,7 @@ def forward(self, input): state = self._state if self.training else None accum = self._accum if self.training else None - out, _, _, _ = _C_ops.fake_quantize_dequantize_moving_average_abs_max( + out, _, _, _ = _legacy_C_ops.fake_quantize_dequantize_moving_average_abs_max( input, self._scale, accum, state, quant_out, self._scale, state, accum, *attrs) @@ -294,7 +294,7 @@ def forward(self, input): persistable=False) out_scale.stop_gradient = True - out, _, = 
_C_ops.fake_channel_wise_quantize_dequantize_abs_max( + out, _, = _legacy_C_ops.fake_channel_wise_quantize_dequantize_abs_max( input, quant_out, out_scale, *attrs) return out @@ -389,7 +389,7 @@ def forward(self, input): state = self._state if self.training else None accum = self._accum if self.training else None - out, _, _, _ = _C_ops.moving_average_abs_max_scale( + out, _, _, _ = _legacy_C_ops.moving_average_abs_max_scale( input, accum, state, quant_out, self._scale, state, accum, *attrs) return out diff --git a/python/paddle/nn/utils/transform_parameters.py b/python/paddle/nn/utils/transform_parameters.py index c417bcb5cdfdf..44b870a9a4744 100644 --- a/python/paddle/nn/utils/transform_parameters.py +++ b/python/paddle/nn/utils/transform_parameters.py @@ -16,7 +16,7 @@ import paddle from paddle.fluid.framework import dygraph_only, _dygraph_tracer, _varbase_creator, in_dygraph_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops #input==output, inplace strategy of reshape has no cost almostly @@ -24,7 +24,7 @@ def _inplace_reshape_dygraph(x, shape): x_shape = _varbase_creator(dtype='int64') if in_dygraph_mode(): with paddle.fluid.dygraph.no_grad(): - tmp_out = _C_ops.final_state_reshape(x, shape) + tmp_out = _C_ops.reshape(x, shape) tmp_out._share_underline_tensor_to(x) else: _dygraph_tracer().trace_op(type="reshape2", @@ -103,7 +103,7 @@ def parameters_to_vector(parameters, name=None): out = _varbase_creator(dtype=dtype) if in_dygraph_mode(): with paddle.fluid.dygraph.no_grad(): - tmp = _C_ops.final_state_concat(parameters, 0) + tmp = _C_ops.concat(parameters, 0) tmp._share_underline_tensor_to(out) else: _dygraph_tracer().trace_op(type='concat', @@ -157,7 +157,7 @@ def vector_to_parameters(vec, parameters, name=None): if in_dygraph_mode(): with paddle.fluid.dygraph.no_grad(): - res = _C_ops.final_state_split(vec, sections, 0) + res = _C_ops.split(vec, sections, 0) for i in range(0, len(parameters)): res[i]._share_underline_tensor_to(parameters[i]) else: diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index 4659a330805b4..40c1021848c86 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -19,7 +19,7 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.data_feeder import check_variable_and_dtype from ...framework import in_dygraph_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -29,8 +29,7 @@ def l2_norm(x, axis, epsilon=1e-12, name=None): axis = 0 if in_dygraph_mode(): - out, norm = _C_ops.final_state_norm(x, 1 if axis is None else axis, - epsilon, False) + out, norm = _C_ops.norm(x, 1 if axis is None else axis, epsilon, False) return paddle.squeeze(norm, axis=[axis]) check_variable_and_dtype(x, "X", ("float32", "float64"), "norm") diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index 7c3cb3d8e3ed4..f3c15ce479da7 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -17,7 +17,7 @@ from ..fluid import framework from ..fluid.framework import Variable, name_scope from ..framework import in_dygraph_mode -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..fluid.dygraph import no_grad __all__ = [] @@ -157,11 +157,9 @@ def _append_optimize_op(self, block, param_and_grad): if in_dygraph_mode(): with no_grad(): - _C_ops.final_state_adadelta_(param_and_grad[0], - param_and_grad[1], - avg_squared_grad_acc, - 
avg_squared_update_acc, self._rho, - self._epsilon) + _C_ops.adadelta_(param_and_grad[0], param_and_grad[1], + avg_squared_grad_acc, avg_squared_update_acc, + self._rho, self._epsilon) return None if not isinstance(block, framework.Block): diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 14db8a620ac8f..96ff625f1f9a8 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -26,7 +26,7 @@ import time import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -342,7 +342,7 @@ def _append_optimize_op(self, block, param_and_grad): _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _, _ = _C_ops.final_state_adam_( + _, _, _, _, _, _ = _C_ops.adam_( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1, _beta2, self._epsilon, self._lazy_mode, 1000, find_master, @@ -356,7 +356,7 @@ def _append_optimize_op(self, block, param_and_grad): self._beta1, Variable) else self._beta1.numpy().item(0) _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _, _ = _legacy_C_ops.adam( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, @@ -583,7 +583,7 @@ def _append_optimize_multi_tensor_op(self, target_block, self._beta2, Variable) else self._beta2.numpy().item(0) if framework._non_static_mode(): - _, _, _, _, _, _ = _C_ops.merged_adam( + _, _, _, _, _, _ = _legacy_C_ops.merged_adam( self._param_dict[key], grad_dict[key], lr_dict[key], self._moment1_dict[key], self._moment2_dict[key], self._beta1_pow_acc_dict[key], diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 65aec003786a3..cb07fdb7f56e9 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -16,7 +16,7 @@ from ..fluid import core from ..fluid import framework from ..fluid.framework import Variable, name_scope -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..fluid.dygraph import no_grad __all__ = [] @@ -192,16 +192,16 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[0]) if framework.in_dygraph_mode(): - _C_ops.final_state_adamax_(param_and_grad[0], param_and_grad[1], - self._create_param_lr(param_and_grad), - moment, inf_norm, beta1_pow_acc, - self._beta1, self._beta2, self._epsilon) + _C_ops.adamax_(param_and_grad[0], param_and_grad[1], + self._create_param_lr(param_and_grad), moment, + inf_norm, beta1_pow_acc, self._beta1, self._beta2, + self._epsilon) elif framework._in_legacy_dygraph(): - _C_ops.adamax(param_and_grad[0], param_and_grad[1], - self._create_param_lr(param_and_grad), moment, - inf_norm, beta1_pow_acc, param_and_grad[0], moment, - inf_norm, "beta1", self._beta1, "beta2", self._beta2, - "epsilon", self._epsilon) + _legacy_C_ops.adamax(param_and_grad[0], param_and_grad[1], + self._create_param_lr(param_and_grad), moment, + inf_norm, beta1_pow_acc, param_and_grad[0], + moment, inf_norm, "beta1", self._beta1, + "beta2", self._beta2, "epsilon", self._epsilon) else: # create the adamax optimize op adamax_op = block.append_op( @@ -240,8 +240,8 @@ def _finish_update(self, block, parameters_and_grads): beta1_pow_acc = self._get_accumulator( self._beta1_pow_acc_str, param) with 
no_grad(): - tmp = _C_ops.final_state_scale(beta1_pow_acc, - self._beta1, 0.0, True) + tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0, + True) beta1_pow_acc.copy_(tmp, False) continue with param.block.program._optimized_guard( @@ -263,8 +263,8 @@ def _finish_update(self, block, parameters_and_grads): self._beta1 = parameters_and_grads.get( 'beta1', self._default_dict['beta1']) with no_grad(): - tmp = _C_ops.final_state_scale(beta1_pow_acc, - self._beta1, 0.0, True) + tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0, + True) beta1_pow_acc.copy_(tmp, False) continue diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0c9e2645ef3e2..fbe23c84a2a91 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -25,7 +25,7 @@ from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base from collections.abc import Callable -from .. import _C_ops +from .. import _C_ops, _legacy_C_ops import paddle __all__ = [] @@ -443,14 +443,14 @@ def _append_optimize_op(self, block, param_and_grad): if framework.in_dygraph_mode(): found_inf = self._get_auxiliary_var('found_inf') - _, _, _, _, _, _ = _C_ops.final_state_adamw_( + _, _, _, _, _, _ = _C_ops.adamw_( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1, _beta2, self._epsilon, lr_ratio_, self._weight_decay, with_decay, self._lazy_mode, 1000, find_master, False) else: - _, _, _, _, _, _ = _C_ops.adamw( + _, _, _, _, _, _ = _legacy_C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], moment1, moment2, beta1_pow_acc, diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py index 5a5f52bb3ef3d..2bdb0a9c73a82 100644 --- a/python/paddle/optimizer/lamb.py +++ b/python/paddle/optimizer/lamb.py @@ -19,7 +19,7 @@ from ..fluid import layers from ..fluid import unique_name from ..fluid.layer_helper import LayerHelper -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.executor import global_scope import paddle @@ -268,20 +268,20 @@ def _append_optimize_op(self, block, param_and_grad): found_inf = self._get_auxiliary_var('found_inf') if framework.in_dygraph_mode(): - _C_ops.final_state_lamb_(param_and_grad[0], param_and_grad[1], lr, - moment1, moment2, beta1_pow_acc, - beta2_pow_acc, master_weight, found_inf, - weight_decay, self._beta1, self._beta2, - self._epsilon, find_master) + _C_ops.lamb_(param_and_grad[0], param_and_grad[1], lr, moment1, + moment2, beta1_pow_acc, beta2_pow_acc, master_weight, + found_inf, weight_decay, self._beta1, self._beta2, + self._epsilon, find_master) return None if framework._non_static_mode(): - _C_ops.lamb(param_and_grad[0], param_and_grad[1], lr, moment1, - moment2, beta1_pow_acc, beta2_pow_acc, master_weight, - param_and_grad[0], moment1, moment2, beta1_pow_acc, - beta2_pow_acc, master_weight, 'beta1', self._beta1, - 'beta2', self._beta2, 'epsilon', self._epsilon, - 'weight_decay', weight_decay, 'multi_precision', - find_master) + _legacy_C_ops.lamb(param_and_grad[0], param_and_grad[1], lr, + moment1, moment2, beta1_pow_acc, beta2_pow_acc, + master_weight, param_and_grad[0], moment1, + moment2, beta1_pow_acc, beta2_pow_acc, + master_weight, 'beta1', self._beta1, 'beta2', + self._beta2, 'epsilon', self._epsilon, + 'weight_decay', weight_decay, 'multi_precision', + find_master) return None # create the lamb optimize op diff 
--git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 3e53619c4cbcc..988ac052b0307 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -23,7 +23,7 @@ from ..fluid import layers import paddle.fluid as fluid from paddle.fluid.regularizer import L2DecayRegularizer -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import paddle from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph @@ -316,7 +316,7 @@ def _append_optimize_op(self, block, param_and_grad): if _in_legacy_dygraph(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) - _, _, _ = _C_ops.momentum( + _, _, _ = _legacy_C_ops.momentum( param_and_grad[0], param_and_grad[1], velocity_acc, lr, master_weight, param_and_grad[0], velocity_acc, master_weight, 'mu', self._momentum, 'use_nesterov', self._use_nesterov, @@ -327,11 +327,11 @@ def _append_optimize_op(self, block, param_and_grad): if in_dygraph_mode(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) - return _C_ops.final_state_momentum_( - param_and_grad[0], param_and_grad[1], velocity_acc, lr, - master_weight, self._momentum, self._use_nesterov, - regularization_method, regularization_coeff, find_master, - self._rescale_grad) + return _C_ops.momentum_(param_and_grad[0], param_and_grad[1], + velocity_acc, lr, master_weight, + self._momentum, self._use_nesterov, + regularization_method, regularization_coeff, + find_master, self._rescale_grad) attrs = { "mu": self._momentum, @@ -474,7 +474,7 @@ def _append_optimize_multi_tensor_op(self, target_block, if framework._non_static_mode(): if in_dygraph_mode(): - _, _, _ = _C_ops.final_state_merged_momentum_( + _, _, _ = _C_ops.merged_momentum_( self._param_dict[key], grad_dict[key], self._velocity_dict[key], lr_dict[key], self._master_weight_dict[key], self._momentum, @@ -483,7 +483,7 @@ def _append_optimize_multi_tensor_op(self, target_block, self._regularization_coeff_dict[key], find_master, self._rescale_grad) else: - _, _, _ = _C_ops.merged_momentum( + _, _, _ = _legacy_C_ops.merged_momentum( self._param_dict[key], grad_dict[key], self._velocity_dict[key], lr_dict[key], self._master_weight_dict[key], diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 9997aba7e31ec..1d399021c8e8d 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -41,7 +41,7 @@ from .. 
import compat as cpt from .lr import LRScheduler import copy -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _current_expected_place, in_dygraph_mode __all__ = [] @@ -475,13 +475,13 @@ def set_lr(self, value): if current_lr is not None: if in_dygraph_mode(): place = _current_expected_place() - _C_ops.final_state_full_(current_lr, list(current_lr.shape), - float(value), current_lr.dtype, place) + _C_ops.full_(current_lr, list(current_lr.shape), float(value), + current_lr.dtype, place) elif _in_legacy_dygraph(): - _C_ops.fill_constant(current_lr, 'value', float(value), 'dtype', - current_lr.dtype, 'shape', - list(current_lr.shape)) + _legacy_C_ops.fill_constant(current_lr, 'value', float(value), + 'dtype', current_lr.dtype, 'shape', + list(current_lr.shape)) else: global_block = framework.default_main_program().global_block() global_block.append_op(type='fill_constant', @@ -1041,10 +1041,10 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): if framework.in_dygraph_mode(): if grad.is_dense() and regularization_term.is_dense(): - return _C_ops.final_state_add_n([grad, regularization_term]) - return _C_ops.sum([grad, regularization_term]) + return _C_ops.add_n([grad, regularization_term]) + return _legacy_C_ops.sum([grad, regularization_term]) elif framework._in_legacy_dygraph(): - return _C_ops.sum([grad, regularization_term]) + return _legacy_C_ops.sum([grad, regularization_term]) new_grad = grad if grad.type == core.VarDesc.VarType.SELECTED_ROWS: diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index a6b23bbefafa8..1b5b9f4c4f145 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -17,7 +17,7 @@ from ..fluid import framework from ..fluid.framework import Variable, name_scope from ..fluid.dygraph import no_grad -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops import warnings from ..fluid.layer_helper import LayerHelper from ..fluid import unique_name @@ -143,12 +143,12 @@ def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) if in_dygraph_mode(): - _C_ops.final_state_sgd_(param_and_grad[0], lr, param_and_grad[1], - master_weight, find_master) + _C_ops.sgd_(param_and_grad[0], lr, param_and_grad[1], master_weight, + find_master) return None if _in_legacy_dygraph(): - _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, - param_and_grad[0], master_weight) + _legacy_C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], + master_weight, param_and_grad[0], master_weight) return None assert isinstance(block, framework.Block) diff --git a/python/paddle/quantization/__init__.py b/python/paddle/quantization/__init__.py new file mode 100644 index 0000000000000..77da408d8e001 --- /dev/null +++ b/python/paddle/quantization/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid.contrib.slim.quantization.imperative.ptq_config import PTQConfig, default_ptq_config +from ...fluid.contrib.slim.quantization.imperative.ptq_quantizer import BaseQuantizer +from ...fluid.contrib.slim.quantization.imperative.ptq_quantizer import AbsmaxQuantizer +from ...fluid.contrib.slim.quantization.imperative.ptq_quantizer import PerChannelAbsmaxQuantizer +from ...fluid.contrib.slim.quantization.imperative.ptq_quantizer import KLQuantizer +from ...fluid.contrib.slim.quantization.imperative.ptq_quantizer import HistQuantizer +from ...fluid.contrib.slim.quantization.imperative.ptq_quantizer import SUPPORT_ACT_QUANTIZERS +from ...fluid.contrib.slim.quantization.imperative.ptq_quantizer import SUPPORT_WT_QUANTIZERS +from ...fluid.contrib.slim.quantization.imperative.ptq_registry import PTQRegistry +from ...fluid.contrib.slim.quantization.imperative.ptq import ImperativePTQ +from ...fluid.contrib.slim.quantization.imperative.qat import ImperativeQuantAware diff --git a/python/paddle/signal.py b/python/paddle/signal.py index 5c0d1d5edb821..656605f1bf2b7 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -21,7 +21,7 @@ from .fluid.data_feeder import check_variable_and_dtype from .fluid.framework import _non_static_mode from .fluid.layer_helper import LayerHelper -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph __all__ = [ @@ -129,12 +129,12 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): op_type = 'frame' if in_dygraph_mode(): - return _C_ops.final_state_frame(x, frame_length, hop_length, axis) + return _C_ops.frame(x, frame_length, hop_length, axis) if _in_legacy_dygraph(): attrs = ('frame_length', frame_length, 'hop_length', hop_length, 'axis', axis) - op = getattr(_C_ops, op_type) + op = getattr(_legacy_C_ops, op_type) out = op(x, *attrs) else: check_variable_and_dtype( @@ -218,10 +218,10 @@ def overlap_add(x, hop_length, axis=-1, name=None): op_type = 'overlap_add' if in_dygraph_mode(): - out = _C_ops.final_state_overlap_add(x, hop_length, axis) + out = _C_ops.overlap_add(x, hop_length, axis) elif paddle.in_dynamic_mode(): attrs = ('hop_length', hop_length, 'axis', axis) - op = getattr(_C_ops, op_type) + op = getattr(_legacy_C_ops, op_type) out = op(x, *attrs) else: check_variable_and_dtype( diff --git a/python/paddle/static/quantization/__init__.py b/python/paddle/static/quantization/__init__.py new file mode 100644 index 0000000000000..325fee5624bff --- /dev/null +++ b/python/paddle/static/quantization/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...fluid.contrib.slim.quantization.quantization_pass import QuantizationTransformPass +from ...fluid.contrib.slim.quantization.quantization_pass import QuantizationFreezePass +from ...fluid.contrib.slim.quantization.quantization_pass import ConvertToInt8Pass +from ...fluid.contrib.slim.quantization.quantization_pass import TransformForMobilePass +from ...fluid.contrib.slim.quantization.quantization_pass import OutScaleForTrainingPass +from ...fluid.contrib.slim.quantization.quantization_pass import OutScaleForInferencePass +from ...fluid.contrib.slim.quantization.quantization_pass import AddQuantDequantPass +from ...fluid.contrib.slim.quantization.quantization_pass import ReplaceFakeQuantDequantPass +from ...fluid.contrib.slim.quantization.quantization_pass import QuantWeightPass +from ...fluid.contrib.slim.quantization.quantization_pass import QuantizationTransformPassV2 +from ...fluid.contrib.slim.quantization.quantization_pass import AddQuantDequantPassV2 +from ...fluid.contrib.slim.quantization.quant_int8_mkldnn_pass import QuantInt8MkldnnPass +from ...fluid.contrib.slim.quantization.quant2_int8_mkldnn_pass import Quant2Int8MkldnnPass + +from ...fluid.contrib.slim.quantization.post_training_quantization import PostTrainingQuantization +from ...fluid.contrib.slim.quantization.post_training_quantization import PostTrainingQuantizationProgram +from ...fluid.contrib.slim.quantization.post_training_quantization import WeightQuantization diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index e3bd7bae7d468..f575092153b41 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -24,7 +24,7 @@ # TODO: define functions to get tensor attributes import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..static import Variable from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode @@ -115,11 +115,11 @@ def shape(input): print(res) # [array([ 3, 100, 100], dtype=int32)] """ if in_dygraph_mode(): - out = _C_ops.final_state_shape(input) + out = _C_ops.shape(input) out.stop_gradient = True return out if _in_legacy_dygraph(): - out = _C_ops.shape(input) + out = _legacy_C_ops.shape(input) out.stop_gradient = True return out @@ -277,9 +277,9 @@ def real(x, name=None): # [4., 5., 6.]]) """ if in_dygraph_mode(): - return _C_ops.final_state_real(x) - if _in_legacy_dygraph(): return _C_ops.real(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.real(x) check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'real') helper = LayerHelper('real', **locals()) @@ -323,9 +323,9 @@ def imag(x, name=None): # [3., 2., 1.]]) """ if in_dygraph_mode(): - return _C_ops.final_state_imag(x) - if _in_legacy_dygraph(): return _C_ops.imag(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.imag(x) check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'imag') helper = LayerHelper('imag', **locals()) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 17393db9b4cce..272c7f2ed8b67 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -28,7 +28,7 @@ from ..framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder # TODO: define functions to get create a tensor import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check import warnings @@ -100,11 +100,10 @@ def linspace(start, stop, num, dtype=None, name=None): 
with device_guard("cpu"): tensor_num = fill_constant([1], 'int32', num, force_cpu=True) if in_dygraph_mode(): - return _C_ops.final_state_linspace(tensor_start, tensor_stop, - tensor_num, dtype) + return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, dtype) if _in_legacy_dygraph(): - return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', - dtype) + return _legacy_C_ops.linspace(tensor_start, tensor_stop, tensor_num, + 'dtype', dtype) helper = LayerHelper("linspace", **locals()) @@ -212,8 +211,8 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): with device_guard("cpu"): tensor_base = fill_constant([1], dtype, base) if _non_static_mode(): - return _C_ops.logspace(tensor_start, tensor_stop, tensor_num, - tensor_base, 'dtype', dtype) + return _legacy_C_ops.logspace(tensor_start, tensor_stop, tensor_num, + tensor_base, 'dtype', dtype) helper = LayerHelper("logspace", **locals()) @@ -504,10 +503,11 @@ def full_like(x, fill_value, dtype=None, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - return _C_ops.final_state_full_like(x, fill_value, dtype, x.place) + return _C_ops.full_like(x, fill_value, dtype, x.place) if _in_legacy_dygraph(): - return _C_ops.fill_any_like(x, 'value', fill_value, 'dtype', dtype) + return _legacy_C_ops.fill_any_like(x, 'value', fill_value, 'dtype', + dtype) helper = LayerHelper("full_like", **locals()) check_variable_and_dtype( @@ -717,11 +717,11 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): if _non_static_mode(): if in_dygraph_mode(): - out = _C_ops.final_state_eye(num_rows, num_columns, dtype, - _current_expected_place()) + out = _C_ops.eye(num_rows, num_columns, dtype, + _current_expected_place()) elif _in_legacy_dygraph(): - out = _C_ops.eye('dtype', dtype, 'num_rows', num_rows, - 'num_columns', num_columns) + out = _legacy_C_ops.eye('dtype', dtype, 'num_rows', num_rows, + 'num_columns', num_columns) else: helper = LayerHelper("eye", **locals()) @@ -883,11 +883,10 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): step = paddle.cast(step, dtype) if in_dygraph_mode(): - return _C_ops.final_state_arange(start, end, step, dtype, - _current_expected_place()) + return _C_ops.arange(start, end, step, dtype, _current_expected_place()) if _in_legacy_dygraph(): - out = _C_ops.range(start, end, step) + out = _legacy_C_ops.range(start, end, step) out.stop_gradient = True return out @@ -999,10 +998,10 @@ def tril(x, diagonal=0, name=None): # [9 , 10, 0 , 0 ]]) """ if in_dygraph_mode(): - return _C_ops.final_state_tril_triu(x, diagonal, True) + return _C_ops.tril_triu(x, diagonal, True) if _in_legacy_dygraph(): - op = getattr(_C_ops, 'tril_triu') + op = getattr(_legacy_C_ops, 'tril_triu') return op(x, 'diagonal', diagonal, "lower", True) return _tril_triu_op(LayerHelper('tril', **locals())) @@ -1064,10 +1063,10 @@ def triu(x, diagonal=0, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_tril_triu(x, diagonal, False) + return _C_ops.tril_triu(x, diagonal, False) if _in_legacy_dygraph(): - op = getattr(_C_ops, 'tril_triu') + op = getattr(_legacy_C_ops, 'tril_triu') return op(x, 'diagonal', diagonal, "lower", False) return _tril_triu_op(LayerHelper('triu', **locals())) @@ -1109,10 +1108,10 @@ def meshgrid(*args, **kwargs): args = args[0] if _in_legacy_dygraph(): num = len(args) - out = _C_ops.meshgrid(list(args), num) + out = _legacy_C_ops.meshgrid(list(args), num) return out if in_dygraph_mode(): - return _C_ops.final_state_meshgrid(list(args)) + return 
_C_ops.meshgrid(list(args)) name = kwargs.get("name", None) helper = LayerHelper('meshgrid', **locals()) @@ -1219,20 +1218,20 @@ def diagflat(x, offset=0, name=None): padding_value = 0 if in_dygraph_mode(): if len(x.shape) == 1: - return _C_ops.final_state_diag(x, offset, padding_value) + return _C_ops.diag(x, offset, padding_value) else: - y = _C_ops.final_state_flatten(x, 0, -1) - return _C_ops.final_state_diag(y, offset, padding_value) + y = _C_ops.flatten(x, 0, -1) + return _C_ops.diag(y, offset, padding_value) if _in_legacy_dygraph(): if len(x.shape) == 1: - return _C_ops.diag_v2(x, "offset", offset, "padding_value", - padding_value) + return _legacy_C_ops.diag_v2(x, "offset", offset, "padding_value", + padding_value) else: - y, _ = _C_ops.flatten_contiguous_range(x, "start_axis", 0, - "stop_axis", -1) - return _C_ops.diag_v2(y, "offset", offset, "padding_value", - padding_value) + y, _ = _legacy_C_ops.flatten_contiguous_range( + x, "start_axis", 0, "stop_axis", -1) + return _legacy_C_ops.diag_v2(y, "offset", offset, "padding_value", + padding_value) check_type(x, 'x', (Variable), 'diagflat') check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], @@ -1346,11 +1345,11 @@ def diag(x, offset=0, padding_value=0, name=None): # [4] """ if in_dygraph_mode(): - return _C_ops.final_state_diag(x, offset, padding_value) + return _C_ops.diag(x, offset, padding_value) else: if _in_legacy_dygraph(): - return _C_ops.diag_v2(x, "offset", offset, "padding_value", - padding_value) + return _legacy_C_ops.diag_v2(x, "offset", offset, "padding_value", + padding_value) else: check_type(x, 'x', (Variable), 'diag_v2') check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], @@ -1434,15 +1433,15 @@ def empty(shape, dtype=None, name=None): if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - out = _C_ops.final_state_empty(shape, convert_np_dtype_to_dtype_(dtype), - _current_expected_place()) + out = _C_ops.empty(shape, convert_np_dtype_to_dtype_(dtype), + _current_expected_place()) out.stop_gradient = True return out if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - out = _C_ops.empty('shape', shape, 'dtype', - convert_np_dtype_to_dtype_(dtype)) + out = _legacy_C_ops.empty('shape', shape, 'dtype', + convert_np_dtype_to_dtype_(dtype)) out.stop_gradient = True return out @@ -1507,15 +1506,14 @@ def empty_like(x, dtype=None, name=None): dtype = convert_dtype(dtype) if in_dygraph_mode(): - out = _C_ops.final_state_empty(x.shape, - convert_np_dtype_to_dtype_(dtype), - _current_expected_place()) + out = _C_ops.empty(x.shape, convert_np_dtype_to_dtype_(dtype), + _current_expected_place()) out.stop_gradient = True return out if _in_legacy_dygraph(): - out = _C_ops.empty('shape', x.shape, 'dtype', - convert_np_dtype_to_dtype_(dtype)) + out = _legacy_C_ops.empty('shape', x.shape, 'dtype', + convert_np_dtype_to_dtype_(dtype)) out.stop_gradient = True return out @@ -1592,13 +1590,13 @@ def assign(x, output=None): if isinstance(input, (Variable, core.VarBase, core.eager.Tensor)): if in_dygraph_mode(): if output is None: - output = _C_ops.final_state_assign(input) + output = _C_ops.assign(input) else: - _C_ops.final_state_assign_out_(input, output) + _C_ops.assign_out_(input, output) elif _in_legacy_dygraph(): if output is None: output = core.VarBase() - _C_ops.assign(input, output) + _legacy_C_ops.assign(input, output) else: check_dtype(input.dtype, 'input', [ 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', @@ -1671,13 +1669,13 @@ def convert_scalar(x): 
if in_dygraph_mode(): if output is None: output = zeros(list(input.shape), dtype) - _C_ops.final_state_assign_value_(output, list(input.shape), dtype, - values, _current_expected_place()) + _C_ops.assign_value_(output, list(input.shape), dtype, values, + _current_expected_place()) elif _in_legacy_dygraph(): if output is None: output = core.VarBase() - _C_ops.assign_value(output, 'shape', list(input.shape), 'dtype', - dtype, value_name, values) + _legacy_C_ops.assign_value(output, 'shape', list(input.shape), + 'dtype', dtype, value_name, values) else: if output is None: output = helper.create_variable_for_type_inference( @@ -1813,10 +1811,10 @@ def complex(real, imag, name=None): # [1.+0.j 1.+1.j 1.+2.j]] """ if in_dygraph_mode(): - return _C_ops.final_state_complex(real, imag) + return _C_ops.complex(real, imag) if paddle.in_dynamic_mode(): - return paddle._C_ops.complex(real, imag) + return paddle._legacy_C_ops.complex(real, imag) check_variable_and_dtype(real, 'real', ['float32', 'float64'], 'complex') check_variable_and_dtype(imag, 'imag', ['float32', 'float64'], 'complex') @@ -1893,13 +1891,13 @@ def tril_indices(row, col, offset=0, dtype='int64'): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - out = _C_ops.final_state_tril_indices(row, col, offset, dtype, - _current_expected_place()) + out = _C_ops.tril_indices(row, col, offset, dtype, + _current_expected_place()) return out if _in_legacy_dygraph(): - out = _C_ops.tril_indices('rows', row, 'cols', col, 'offset', offset, - "dtype", dtype) + out = _legacy_C_ops.tril_indices('rows', row, 'cols', col, 'offset', + offset, "dtype", dtype) return out else: @@ -1978,13 +1976,13 @@ def triu_indices(row, col=None, offset=0, dtype='int64'): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - out = _C_ops.final_state_triu_indices(row, col, offset, dtype, - _current_expected_place()) + out = _C_ops.triu_indices(row, col, offset, dtype, + _current_expected_place()) return out if _in_legacy_dygraph(): - out = _C_ops.triu_indices('row', row, 'col', col, 'offset', offset, - "dtype", dtype) + out = _legacy_C_ops.triu_indices('row', row, 'col', col, 'offset', + offset, "dtype", dtype) return out else: diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 34a1ead2cb497..55726831d2e35 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -21,7 +21,7 @@ from .math import multiply from .math import sum as paddle_sum from ..fluid.framework import _in_legacy_dygraph -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid.layer_helper import LayerHelper from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph @@ -803,12 +803,12 @@ def gen_einsum_op(equation, *operands): """ assert len(operands) <= 2, "Only support two operands in EinsumOp." 
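# (Illustrative aside, not part of the patch. Whichever branch the dispatch
# below takes -- the renamed eager binding `_C_ops.einsum` or the attr-style
# `_legacy_C_ops.einsum` -- the public entry point and its result are
# unchanged. A minimal, self-contained call that reaches this dispatch,
# assuming a working dygraph install:)
import paddle

a = paddle.rand([2, 3])
b = paddle.rand([3, 4])
# matrix multiplication written as an einsum; routes through gen_einsum_op
c = paddle.einsum('ij,jk->ik', a, b)
assert c.shape == [2, 4]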
if in_dygraph_mode(): - return _C_ops.final_state_einsum(operands, equation)[0] + return _C_ops.einsum(operands, equation)[0] if _in_legacy_dygraph(): # dygraph - return _C_ops.einsum(operands, len(operands), len(operands), 'equation', - equation)[0] + return _legacy_C_ops.einsum(operands, len(operands), len(operands), + 'equation', equation)[0] for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 15d81624591f1..f9c32c3254b78 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -25,7 +25,7 @@ from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -258,14 +258,13 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): - final_state_op_type = "final_state_%s" % op_type - if in_dygraph_mode() and hasattr(_C_ops, final_state_op_type): - op = getattr(_C_ops, final_state_op_type) + if in_dygraph_mode() and hasattr(_C_ops, op_type): + op = getattr(_C_ops, op_type) return op(x) # TODO(dev): Because some ops' yaml has not been migrated. # Replace it with _in_legacy_dygraph while all yaml work is done. if _non_static_mode(): - op = getattr(_C_ops, op_type) + op = getattr(_legacy_C_ops, op_type) return op(x) if op_type not in ["abs", "exp", "square"]: @@ -305,12 +304,11 @@ def generate_inplace_fn(inplace_op_type): origin_op_type = inplace_op_type[:-1] def func(x, name=None): - final_state_inplace_op_type = "final_state_%s" % inplace_op_type - if in_dygraph_mode() and hasattr(_C_ops, final_state_inplace_op_type): - op = getattr(_C_ops, final_state_inplace_op_type) + if in_dygraph_mode() and hasattr(_C_ops, inplace_op_type): + op = getattr(_C_ops, inplace_op_type) return op(x) if _non_static_mode(): - op = getattr(_C_ops, inplace_op_type) + op = getattr(_legacy_C_ops, inplace_op_type) return op(x) warnings.warn( "In static mode, {}() is the same as {}() and does not perform inplace operation." 
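# (Illustrative sketch, not part of the patch: the dispatch pattern that the
# call sites in this change follow. After the rename, `_C_ops` exposes the
# generated eager kernels under their plain op names with positional
# arguments, while the old attr-style bindings move to `_legacy_C_ops` and
# keep taking (attribute name, value) pairs. `transpose_sketch` is a made-up
# name for illustration; the mode checks are the helpers these files import.)
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph


def transpose_sketch(x, perm):
    if in_dygraph_mode():
        # new eager binding: plain name, positional arguments, direct result
        return _C_ops.transpose(x, perm)
    if _in_legacy_dygraph():
        # legacy binding: attributes as name/value pairs; transpose2 also
        # returns an XShape tensor that callers discard
        out, _ = _legacy_C_ops.transpose2(x, 'axis', perm)
        return out
    # static-graph construction (LayerHelper + append_op) omitted here
    raise NotImplementedError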
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 593ce07f32ecf..652cfb37289c1 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -27,7 +27,7 @@ import warnings from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -86,10 +86,10 @@ def transpose(x, perm, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_transpose(x, perm) + return _C_ops.transpose(x, perm) else: if _in_legacy_dygraph(): - out, _ = _C_ops.transpose2(x, 'axis', perm) + out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out check_variable_and_dtype(x, 'x', [ @@ -220,11 +220,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_matmul(x, y, transpose_x, transpose_y) + return _C_ops.matmul(x, y, transpose_x, transpose_y) if _in_legacy_dygraph(): op_type = 'matmul_v2' - op = getattr(_C_ops, op_type) + op = getattr(_legacy_C_ops, op_type) return op(x, y, 'trans_x', transpose_x, 'trans_y', transpose_y) attrs = { @@ -341,15 +341,14 @@ def frobenius_norm(input, dim=None, keepdim=False, name=None): if in_dygraph_mode(): if dim is None: - return _C_ops.final_state_frobenius_norm( - input, [], keepdim, True) - return _C_ops.final_state_frobenius_norm(input, dim, keepdim, False) + return _C_ops.frobenius_norm(input, [], keepdim, True) + return _C_ops.frobenius_norm(input, dim, keepdim, False) if _in_legacy_dygraph(): if dim is None: - return _C_ops.frobenius_norm(input, 'keep_dim', keepdim, - 'reduce_all', True) - return _C_ops.frobenius_norm(input, 'dim', dim, 'keep_dim', keepdim, - 'reduce_all', False) + return _legacy_C_ops.frobenius_norm(input, 'keep_dim', keepdim, + 'reduce_all', True) + return _legacy_C_ops.frobenius_norm(input, 'dim', dim, 'keep_dim', + keepdim, 'reduce_all', False) attrs = {'dim': dim, 'keep_dim': keepdim, 'reduce_all': False} if dim is None: attrs['reduce_all'] = True @@ -382,13 +381,13 @@ def vector_norm(input, """ if in_dygraph_mode(): if axis is None: axis = -1 - return _C_ops.final_state_p_norm(input, porder, axis, 1e-12, - keepdim, asvector) + return _C_ops.p_norm(input, porder, axis, 1e-12, keepdim, asvector) if _in_legacy_dygraph(): if axis is None: axis = -1 - return _C_ops.p_norm(input, 'porder', porder, 'axis', axis, - 'keepdim', keepdim, 'asvector', asvector) + return _legacy_C_ops.p_norm(input, 'porder', porder, 'axis', axis, + 'keepdim', keepdim, 'asvector', + asvector) if porder is not None: check_type(porder, 'porder', (float, int), 'p_norm') @@ -421,15 +420,15 @@ def inf_norm(input, asvector=False, name=None): if in_dygraph_mode(): - out = _C_ops.final_state_abs(input) + out = _C_ops.abs(input) reduce_all = True if axis == None or axis == [] or asvector == True else False axis = axis if axis != None and axis != [] else [0] if reduce_all: assert (axis == []) or (axis is None) if porder == np.float64('inf'): - return _C_ops.final_state_max(out, axis, keepdim) + return _C_ops.max(out, axis, keepdim) else: - return _C_ops.final_state_min(out, axis, keepdim) + return _C_ops.min(out, axis, keepdim) helper = LayerHelper('inf_norm', **locals()) out = helper.create_variable_for_type_inference( @@ -460,10 +459,10 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): This function actually treats the matrix as flattened vector to calculate vector norm instead of matrix norm. 
""" if in_dygraph_mode(): - abs_out = _C_ops.final_state_abs(input) - pow_out = _C_ops.final_state_pow(abs_out, porder) - sum_out = _C_ops.final_state_sum(pow_out, axis, None, keepdim) - out = _C_ops.final_state_pow(sum_out, float(1. / porder)) + abs_out = _C_ops.abs(input) + pow_out = _C_ops.pow(abs_out, porder) + sum_out = _C_ops.sum(pow_out, axis, None, keepdim) + out = _C_ops.pow(sum_out, float(1. / porder)) return out block = LayerHelper('norm', **locals()) @@ -652,7 +651,7 @@ def dist(x, y, p=2, name=None): print(out) # out = [0.] """ if in_dygraph_mode(): - return _C_ops.final_state_dist(x, y, p) + return _C_ops.dist(x, y, p) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'dist') check_variable_and_dtype(y, 'dtype', ['float32', 'float64'], 'dist') @@ -768,18 +767,21 @@ def mat_norm(input, porder=1., axis=None): keepdim = False if _non_static_mode(): - abs_out = _C_ops.abs(input) + abs_out = _legacy_C_ops.abs(input) if in_dygraph_mode(): - sum_out = _C_ops.final_state_sum(abs_out, axis, None, keepdim) + sum_out = _C_ops.sum(abs_out, axis, None, keepdim) else: - sum_out = _C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', reduce_all) + sum_out = _legacy_C_ops.reduce_sum(abs_out, 'dim', axis, + 'keepdim', keepdim, + 'reduce_all', reduce_all) if porder == 1 or porder == np.inf: - return _C_ops.reduce_max(sum_out, 'dim', [-1], 'keepdim', - keepdim, 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_max(sum_out, 'dim', [-1], 'keepdim', + keepdim, 'reduce_all', + reduce_all) if porder == -1 or porder == -np.inf: - return _C_ops.reduce_min(sum_out, 'dim', [-1], 'keepdim', - keepdim, 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_min(sum_out, 'dim', [-1], 'keepdim', + keepdim, 'reduce_all', + reduce_all) block = LayerHelper('norm', **locals()) abs_out = block.create_variable_for_type_inference( @@ -828,17 +830,19 @@ def fro_norm(input, porder=2, axis=[-1]): keepdim = False if in_dygraph_mode(): - pow_out = _C_ops.pow(input, 'factor', porder) - sum_out_1 = _C_ops.final_state_sum(pow_out, axis, None, keepdim) - sum_out_2 = _C_ops.final_state_sum(sum_out_1, axis, None, keepdim) - return _C_ops.pow(sum_out_2, 'factor', float(1. / porder)) + pow_out = _legacy_C_ops.pow(input, 'factor', porder) + sum_out_1 = _C_ops.sum(pow_out, axis, None, keepdim) + sum_out_2 = _C_ops.sum(sum_out_1, axis, None, keepdim) + return _legacy_C_ops.pow(sum_out_2, 'factor', float(1. / porder)) elif paddle.in_dynamic_mode(): - pow_out = _C_ops.pow(input, 'factor', porder) - sum_out_1 = _C_ops.reduce_sum(pow_out, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', reduce_all) - sum_out_2 = _C_ops.reduce_sum(sum_out_1, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', reduce_all) - return _C_ops.pow(sum_out_2, 'factor', float(1. / porder)) + pow_out = _legacy_C_ops.pow(input, 'factor', porder) + sum_out_1 = _legacy_C_ops.reduce_sum(pow_out, 'dim', axis, + 'keepdim', keepdim, + 'reduce_all', reduce_all) + sum_out_2 = _legacy_C_ops.reduce_sum(sum_out_1, 'dim', axis, + 'keepdim', keepdim, + 'reduce_all', reduce_all) + return _legacy_C_ops.pow(sum_out_2, 'factor', float(1. 
/ porder)) block = LayerHelper('norm', **locals()) pow_out = block.create_variable_for_type_inference( @@ -889,20 +893,23 @@ def svd_norm(input, porder, axis=[-1]): if _non_static_mode(): if porder == "nuc": if in_dygraph_mode(): - return _C_ops.final_state_sum(s, axis, None, keepdim) + return _C_ops.sum(s, axis, None, keepdim) else: - return _C_ops.reduce_sum(s, 'dim', axis, 'keepdim', keepdim, - 'reduce_all', reduce_all) - max_out = _C_ops.reduce_max(s, 'dim', axis, 'keepdim', keepdim, - 'reduce_all', reduce_all) - min_out = _C_ops.reduce_min(s, 'dim', axis, 'keepdim', keepdim, - 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_sum(s, 'dim', axis, 'keepdim', + keepdim, 'reduce_all', + reduce_all) + max_out = _legacy_C_ops.reduce_max(s, 'dim', axis, 'keepdim', + keepdim, 'reduce_all', + reduce_all) + min_out = _legacy_C_ops.reduce_min(s, 'dim', axis, 'keepdim', + keepdim, 'reduce_all', + reduce_all) if porder == 2: - return _C_ops.elementwise_div(max_out, min_out, 'aixs', axis, - 'use_mkldnn', False) + return _legacy_C_ops.elementwise_div(max_out, min_out, 'aixs', + axis, 'use_mkldnn', False) if porder == -2: - return _C_ops.elementwise_div(min_out, max_out, 'aixs', axis, - 'use_mkldnn', False) + return _legacy_C_ops.elementwise_div(min_out, max_out, 'aixs', + axis, 'use_mkldnn', False) block = LayerHelper('norm', **locals()) out = block.create_variable_for_type_inference( @@ -1035,9 +1042,9 @@ def dot(x, y, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_dot(x, y) - if _in_legacy_dygraph(): return _C_ops.dot(x, y) + if _in_legacy_dygraph(): + return _legacy_C_ops.dot(x, y) op_type = 'dot' @@ -1228,7 +1235,7 @@ def t(input, name=None): return input # 2-D tensor perm = [1, 0] - out = _C_ops.final_state_transpose(input, perm) + out = _C_ops.transpose(input, perm) return out if _in_legacy_dygraph(): @@ -1236,7 +1243,7 @@ def t(input, name=None): return input # 2-D tensor perm = [1, 0] - out, _ = _C_ops.transpose2(input, 'axis', perm) + out, _ = _legacy_C_ops.transpose2(input, 'axis', perm) return out check_variable_and_dtype( @@ -1299,13 +1306,13 @@ def cross(x, y, axis=9, name=None): """ if in_dygraph_mode(): axis = K_DEFAULT_DIM if axis is None else axis - return _C_ops.final_state_cross(x, y, axis) + return _C_ops.cross(x, y, axis) else: if _in_legacy_dygraph(): if axis is not None: - return _C_ops.cross(x, y, 'dim', axis) + return _legacy_C_ops.cross(x, y, 'dim', axis) else: - return _C_ops.cross(x, y) + return _legacy_C_ops.cross(x, y) else: helper = LayerHelper("cross", **locals()) out = helper.create_variable_for_type_inference(x.dtype) @@ -1362,10 +1369,10 @@ def cholesky(x, upper=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_cholesky(x, upper) + return _C_ops.cholesky(x, upper) if _in_legacy_dygraph(): - return _C_ops.cholesky(x, "upper", upper) + return _legacy_C_ops.cholesky(x, "upper", upper) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'cholesky') check_type(upper, 'upper', bool, 'cholesky') @@ -1424,9 +1431,8 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): else: tol_tensor = tol use_default_tol = False - return _C_ops.final_state_matrix_rank_tol(x, tol_tensor, - use_default_tol, - hermitian) + return _C_ops.matrix_rank_tol(x, tol_tensor, use_default_tol, + hermitian) if tol is None: tol_attr = 0.0 @@ -1434,8 +1440,7 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): else: tol_attr = float(tol) use_default_tol = False - return _C_ops.final_state_matrix_rank(x, tol_attr, use_default_tol, - 
hermitian) + return _C_ops.matrix_rank(x, tol_attr, use_default_tol, hermitian) if _in_legacy_dygraph(): if tol is None: @@ -1453,8 +1458,9 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): tol_tensor = None tol_attr = float(tol) use_default_tol = False - return _C_ops.matrix_rank(x, tol_tensor, "tol", tol_attr, 'hermitian', - hermitian, 'use_default_tol', use_default_tol) + return _legacy_C_ops.matrix_rank(x, tol_tensor, "tol", tol_attr, + 'hermitian', hermitian, + 'use_default_tol', use_default_tol) inputs = {} attrs = {} @@ -1539,10 +1545,10 @@ def bmm(x, y, name=None): .format(x_shape, y_shape)) if in_dygraph_mode(): - return _C_ops.final_state_bmm(x, y) + return _C_ops.bmm(x, y) if paddle.in_dynamic_mode(): - return _C_ops.bmm(x, y) + return _legacy_C_ops.bmm(x, y) helper = LayerHelper('bmm', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -1576,10 +1582,11 @@ def histogram(input, bins=100, min=0, max=0, name=None): print(result) # [0, 2, 1, 0] """ if in_dygraph_mode(): - return _C_ops.final_state_histogram(input, bins, min, max) + return _C_ops.histogram(input, bins, min, max) if _in_legacy_dygraph(): - return _C_ops.histogram(input, "bins", bins, "min", min, "max", max) + return _legacy_C_ops.histogram(input, "bins", bins, "min", min, "max", + max) helper = LayerHelper('histogram', **locals()) check_variable_and_dtype(input, 'X', @@ -1628,7 +1635,7 @@ def bincount(x, weights=None, minlength=0, name=None): raise TypeError("Elements in Input(x) should all be integers") if _non_static_mode(): - return _C_ops.bincount(x, weights, "minlength", minlength) + return _legacy_C_ops.bincount(x, weights, "minlength", minlength) helper = LayerHelper('bincount', **locals()) @@ -1682,10 +1689,10 @@ def mv(x, vec, name=None): # [14., 10.]) """ if in_dygraph_mode(): - return _C_ops.final_state_mv(x, vec) + return _C_ops.mv(x, vec) else: if _in_legacy_dygraph(): - out = _C_ops.mv(x, vec) + out = _legacy_C_ops.mv(x, vec) return out else: @@ -1743,10 +1750,10 @@ def det(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_det(x) + return _C_ops.det(x) if _in_legacy_dygraph(): - return _C_ops.determinant(x) + return _legacy_C_ops.determinant(x) check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'det') @@ -1802,10 +1809,10 @@ def slogdet(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_slogdet(x) + return _C_ops.slogdet(x) elif paddle.in_dynamic_mode(): - return _C_ops.slogdeterminant(x) + return _legacy_C_ops.slogdeterminant(x) check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'slogdet') @@ -1878,9 +1885,9 @@ def svd(x, full_matrices=False, name=None): # V * VH == I """ if in_dygraph_mode(): - return _C_ops.final_state_svd(x, full_matrices) + return _C_ops.svd(x, full_matrices) if _in_legacy_dygraph(): - return _C_ops.svd(x, 'full_matrices', full_matrices) + return _legacy_C_ops.svd(x, 'full_matrices', full_matrices) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'svd') check_type(full_matrices, 'full_matrices', bool, 'svd') helper = LayerHelper('svd', **locals()) @@ -1958,10 +1965,10 @@ def matrix_power(x, n, name=None): # [ 1.80555556 , -1.91666667 , 0.44444444 ]] """ if in_dygraph_mode(): - return _C_ops.final_state_matrix_power(x, n) + return _C_ops.matrix_power(x, n) if _in_legacy_dygraph(): - return _C_ops.matrix_power(x, "n", n) + return _legacy_C_ops.matrix_power(x, "n", n) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'matrix_power') check_type(n, 'n', int, 'matrix_power') @@ 
-2017,13 +2024,13 @@ def qr(x, mode="reduced", name=None): # one can verify : X = Q * R ; """ if in_dygraph_mode(): - q, r = _C_ops.final_state_qr(x, mode) + q, r = _C_ops.qr(x, mode) if mode == "r": return r else: return q, r if _in_legacy_dygraph(): - q, r = _C_ops.qr(x, 'mode', mode) + q, r = _legacy_C_ops.qr(x, 'mode', mode) if mode == "r": return r else: @@ -2127,9 +2134,9 @@ def lu(x, pivot=True, get_infos=False, name=None): """ if in_dygraph_mode(): - lu, p, info = _C_ops.final_state_lu(x, pivot) + lu, p, info = _C_ops.lu(x, pivot) elif paddle.in_dynamic_mode(): - lu, p, info = _C_ops.lu(x, 'pivot', pivot) + lu, p, info = _legacy_C_ops.lu(x, 'pivot', pivot) else: check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'lu') helper = LayerHelper('lu', **locals()) @@ -2224,13 +2231,12 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): """ if in_dygraph_mode(): - P, L, U = _C_ops.final_state_lu_unpack(x, y, unpack_ludata, - unpack_pivots) + P, L, U = _C_ops.lu_unpack(x, y, unpack_ludata, unpack_pivots) return P, L, U if paddle.in_dynamic_mode(): - P, L, U = _C_ops.lu_unpack(x, y, 'unpack_ludata', unpack_ludata, - 'unpack_pivots', unpack_pivots) + P, L, U = _legacy_C_ops.lu_unpack(x, y, 'unpack_ludata', unpack_ludata, + 'unpack_pivots', unpack_pivots) return P, L, U check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'lu_unpack') @@ -2305,9 +2311,9 @@ def eig(x, name=None): # (-0.21026087843552282+0j)]) """ if in_dygraph_mode(): - return _C_ops.final_state_eig(x) + return _C_ops.eig(x) elif paddle.in_dynamic_mode(): - w, v = _C_ops.eig(x) + w, v = _legacy_C_ops.eig(x) return w, v check_variable_and_dtype(x, 'X', @@ -2377,9 +2383,9 @@ def eigvals(x, name=None): .format(x_shape)) if in_dygraph_mode(): - return _C_ops.final_state_eigvals(x) - elif paddle.in_dynamic_mode(): return _C_ops.eigvals(x) + elif paddle.in_dynamic_mode(): + return _legacy_C_ops.eigvals(x) helper = LayerHelper('eigvals', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -2450,9 +2456,9 @@ def multi_dot(x, name=None): """ if _in_legacy_dygraph(): - return _C_ops.multi_dot(x) + return _legacy_C_ops.multi_dot(x) if in_dygraph_mode(): - return _C_ops.final_state_multi_dot(x) + return _C_ops.multi_dot(x) check_type(x, 'x', (list, tuple), 'multi_dot') for id, item in enumerate(x): @@ -2504,10 +2510,10 @@ def eigh(x, UPLO='L', name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_eigh(x, UPLO) + return _C_ops.eigh(x, UPLO) if _in_legacy_dygraph(): - return _C_ops.eigh(x, 'UPLO', UPLO) + return _legacy_C_ops.eigh(x, 'UPLO', UPLO) def __check_input(x, UPLO): x_shape = list(x.shape) @@ -2609,8 +2615,8 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): if in_dygraph_mode(): if not hermitian: # combine svd and matmul op - u, s, vt = _C_ops.final_state_svd(x, False) - max_singular_val = _C_ops.final_state_max(s, [-1], True) + u, s, vt = _C_ops.svd(x, False) + max_singular_val = _C_ops.max(s, [-1], True) rcond = paddle.to_tensor(rcond, dtype=x.dtype) cutoff = rcond * max_singular_val y = float('inf') @@ -2622,20 +2628,20 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): out1 = multiply(1 / s, cond_int) out2 = multiply(1 / y, cond_not_int) singular = add(out1, out2) - st = _C_ops.final_state_unsqueeze(singular, [-2]) + st = _C_ops.unsqueeze(singular, [-2]) dims = list(range(len(vt.shape))) perm = dims[:-2] + [dims[-1]] + [dims[-2]] - v = _C_ops.final_state_transpose(vt, perm) + v = _C_ops.transpose(vt, perm) out_1 = v * st - out_2 = 
_C_ops.final_state_matmul(out_1, u, False, True) + out_2 = _C_ops.matmul(out_1, u, False, True) return out_2 else: # combine eigh and matmul op - s, u = _C_ops.final_state_eigh(x, 'UPLO') + s, u = _C_ops.eigh(x, 'UPLO') s_abs = paddle.abs(s) - max_singular_val = _C_ops.final_state_max(s_abs, [-1], True) + max_singular_val = _C_ops.max(s_abs, [-1], True) rcond = paddle.to_tensor(rcond, dtype=s.dtype) cutoff = rcond * max_singular_val y = float('inf') @@ -2647,18 +2653,18 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): out1 = multiply(1 / s, cond_int) out2 = multiply(1 / y, cond_not_int) singular = add(out1, out2) - st = _C_ops.final_state_unsqueeze(singular, [-2]) + st = _C_ops.unsqueeze(singular, [-2]) out_1 = u * st - u_conj = _C_ops.final_state_conj(u) - out_2 = _C_ops.final_state_matmul(out_1, u_conj, False, True) + u_conj = _C_ops.conj(u) + out_2 = _C_ops.matmul(out_1, u_conj, False, True) return out_2 if _in_legacy_dygraph(): if not hermitian: # combine svd and matmul op - u, s, vt = _C_ops.svd(x, 'full_matrices', False) - max_singular_val = _C_ops.reduce_max(s, 'dim', [-1], 'keep_dim', True, \ + u, s, vt = _legacy_C_ops.svd(x, 'full_matrices', False) + max_singular_val = _legacy_C_ops.reduce_max(s, 'dim', [-1], 'keep_dim', True, \ 'reduce_all', False) rcond = paddle.to_tensor(rcond, dtype=x.dtype) cutoff = rcond * max_singular_val @@ -2671,24 +2677,24 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): out1 = multiply(1 / s, cond_int) out2 = multiply(1 / y, cond_not_int) singular = add(out1, out2) - st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) + st, _ = _legacy_C_ops.unsqueeze2(singular, 'axes', [-2]) dims = list(range(len(vt.shape))) perm = dims[:-2] + [dims[-1]] + [dims[-2]] - v, _ = _C_ops.transpose2(vt, 'axis', perm) + v, _ = _legacy_C_ops.transpose2(vt, 'axis', perm) out_1 = v * st if in_dygraph_mode(): - out_2 = _C_ops.final_state_matmul(out_1, u, False, True) + out_2 = _C_ops.matmul(out_1, u, False, True) else: - out_2 = _C_ops.matmul_v2(out_1, u, 'trans_x', False, 'trans_y', - True) + out_2 = _legacy_C_ops.matmul_v2(out_1, u, 'trans_x', False, + 'trans_y', True) return out_2 else: # combine eigh and matmul op - s, u = _C_ops.eigh(x, 'UPLO', 'L') + s, u = _legacy_C_ops.eigh(x, 'UPLO', 'L') s_abs = paddle.abs(s) - max_singular_val = _C_ops.reduce_max(s_abs, 'dim', [-1], 'keep_dim', True, \ + max_singular_val = _legacy_C_ops.reduce_max(s_abs, 'dim', [-1], 'keep_dim', True, \ 'reduce_all', False) rcond = paddle.to_tensor(rcond, dtype=s.dtype) cutoff = rcond * max_singular_val @@ -2701,15 +2707,15 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): out1 = multiply(1 / s, cond_int) out2 = multiply(1 / y, cond_not_int) singular = add(out1, out2) - st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) + st, _ = _legacy_C_ops.unsqueeze2(singular, 'axes', [-2]) out_1 = u * st - u_conj = _C_ops.conj(u) + u_conj = _legacy_C_ops.conj(u) if in_dygraph_mode(): - out_2 = _C_ops.final_state_matmul(out_1, u_conj, False, True) + out_2 = _C_ops.matmul(out_1, u_conj, False, True) else: - out_2 = _C_ops.matmul_v2(out_1, u_conj, 'trans_x', False, - 'trans_y', True) + out_2 = _legacy_C_ops.matmul_v2(out_1, u_conj, 'trans_x', False, + 'trans_y', True) return out_2 else: if not hermitian: @@ -2938,10 +2944,10 @@ def solve(x, y, name=None): # [2., 3.]) """ if in_dygraph_mode(): - return _C_ops.final_state_solve(x, y) + return _C_ops.solve(x, y) if _in_legacy_dygraph(): - return _C_ops.solve(x, y) + return _legacy_C_ops.solve(x, y) inputs = {"X": [x], "Y": [y]} helper = 
LayerHelper("solve", **locals()) @@ -3008,13 +3014,12 @@ def triangular_solve(x, # [7, -2, -5] """ if in_dygraph_mode(): - return _C_ops.final_state_triangular_solve(x, y, upper, transpose, - unitriangular) + return _C_ops.triangular_solve(x, y, upper, transpose, unitriangular) if paddle.in_dynamic_mode(): - return _C_ops.triangular_solve(x, y, 'upper', upper, 'transpose', - transpose, 'unitriangular', - unitriangular) + return _legacy_C_ops.triangular_solve(x, y, 'upper', upper, 'transpose', + transpose, 'unitriangular', + unitriangular) inputs = {"X": [x], "Y": [y]} helper = LayerHelper("triangular_solve", **locals()) @@ -3070,10 +3075,10 @@ def cholesky_solve(x, y, upper=False, name=None): # [-2.5, -7, 9.5] """ if in_dygraph_mode(): - return _C_ops.final_state_cholesky_solve(x, y, upper) + return _C_ops.cholesky_solve(x, y, upper) if _in_legacy_dygraph(): - return _C_ops.cholesky_solve(x, y, 'upper', upper) + return _legacy_C_ops.cholesky_solve(x, y, 'upper', upper) helper = LayerHelper("cholesky_solve", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'cholesky_solve') @@ -3118,12 +3123,12 @@ def eigvalsh(x, UPLO='L', name=None): #[0.17157288, 5.82842712] """ if in_dygraph_mode(): - values, _ = _C_ops.final_state_eigvalsh(x, UPLO, x.stop_gradient) + values, _ = _C_ops.eigvalsh(x, UPLO, x.stop_gradient) return values elif paddle.in_dynamic_mode(): is_test = x.stop_gradient - values, _ = _C_ops.eigvalsh(x, 'UPLO', UPLO, 'is_test', is_test) + values, _ = _legacy_C_ops.eigvalsh(x, 'UPLO', UPLO, 'is_test', is_test) return values def __check_input(x, UPLO): @@ -3254,10 +3259,10 @@ def lstsq(x, y, rcond=None, driver=None, name=None): if _non_static_mode(): if in_dygraph_mode(): - solution, residuals, rank, singular_values = _C_ops.final_state_lstsq( + solution, residuals, rank, singular_values = _C_ops.lstsq( x, y, rcond, driver) else: - solution, residuals, rank, singular_values = _C_ops.lstsq( + solution, residuals, rank, singular_values = _legacy_C_ops.lstsq( x, y, 'rcond', rcond, 'driver', driver) if driver == "gels": diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 8834ae1d400f1..63a8932750567 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -27,19 +27,25 @@ from ..framework import LayerHelper from ..fluid.framework import _in_legacy_dygraph # TODO: define logic functions of a tensor -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.tensor.creation import full __all__ = [] def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): - if _non_static_mode(): + if in_dygraph_mode(): op = getattr(_C_ops, op_name) if binary_op: return op(x, y) else: return op(x) + elif _in_legacy_dygraph(): + op = getattr(_legacy_C_ops, op_name) + if binary_op: + return op(x, y) + else: + return op(x) check_variable_and_dtype( x, "x", ["bool", "int8", "int16", "int32", "int64", "float32", "float64"], @@ -108,7 +114,7 @@ def logical_and(x, y, out=None, name=None): print(res) # [True False True False] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_and(x, y) + return _C_ops.logical_and(x, y) return _logical_op(op_name="logical_and", x=x, @@ -154,7 +160,7 @@ def logical_or(x, y, out=None, name=None): print(res) # [[ True True] [ True False]] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_or(x, y) + return _C_ops.logical_or(x, y) return _logical_op(op_name="logical_or", x=x, y=y, @@ -199,7 +205,7 @@ def logical_xor(x, y, out=None, name=None): print(res) # 
[[False, True], [ True, False]] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_xor(x, y) + return _C_ops.logical_xor(x, y) return _logical_op(op_name="logical_xor", x=x, @@ -238,7 +244,7 @@ def logical_not(x, out=None, name=None): print(res) # [False True False True] """ if in_dygraph_mode(): - return _C_ops.final_state_logical_not(x) + return _C_ops.logical_not(x) return _logical_op(op_name="logical_not", x=x, y=None, @@ -278,9 +284,9 @@ def is_empty(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_is_empty(x) - if _in_legacy_dygraph(): return _C_ops.is_empty(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.is_empty(x) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], 'is_empty') @@ -325,10 +331,10 @@ def equal_all(x, y, name=None): print(result2) # result2 = [False ] """ if in_dygraph_mode(): - return _C_ops.final_state_equal_all(x, y) + return _C_ops.equal_all(x, y) if paddle.in_dynamic_mode(): - return _C_ops.equal_all(x, y) + return _legacy_C_ops.equal_all(x, y) helper = LayerHelper("equal_all", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') @@ -391,11 +397,11 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): # C++ backend will cast it into float32 if passing float from python. as_tensor = lambda x: paddle.to_tensor( [x], dtype='float64', place='cpu') - return _C_ops.final_state_allclose(x, y, as_tensor(rtol), - as_tensor(atol), equal_nan) + return _C_ops.allclose(x, y, as_tensor(rtol), as_tensor(atol), + equal_nan) if _in_legacy_dygraph(): - return _C_ops.allclose(x, y, 'rtol', str(rtol), 'atol', str(atol), - 'equal_nan', equal_nan) + return _legacy_C_ops.allclose(x, y, 'rtol', str(rtol), 'atol', + str(atol), 'equal_nan', equal_nan) check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose') check_variable_and_dtype(y, "input", ['float32', 'float64'], 'allclose') check_type(rtol, 'rtol', float, 'allclose') @@ -454,10 +460,10 @@ def equal(x, y, name=None): if in_dygraph_mode(): default_axis = -1 - return _C_ops.final_state_equal(x, y, default_axis) + return _C_ops.equal(x, y, default_axis) else: if _in_legacy_dygraph(): - return _C_ops.equal(x, y) + return _legacy_C_ops.equal(x, y) else: check_variable_and_dtype( x, "x", ["bool", "float32", "float64", "int32", "int64"], @@ -506,10 +512,10 @@ def greater_equal(x, y, name=None): """ if in_dygraph_mode(): default_axis = -1 - return _C_ops.final_state_greater_equal(x, y, default_axis) + return _C_ops.greater_equal(x, y, default_axis) else: if _in_legacy_dygraph(): - return _C_ops.greater_equal(x, y) + return _legacy_C_ops.greater_equal(x, y) else: check_variable_and_dtype( x, "x", ["bool", "float32", "float64", "int32", "int64"], @@ -557,10 +563,10 @@ def greater_than(x, y, name=None): print(result1) # result1 = [False False True] """ if in_dygraph_mode(): - return _C_ops.final_state_greater_than(x, y, -1) + return _C_ops.greater_than(x, y, -1) else: if _in_legacy_dygraph(): - return _C_ops.greater_than(x, y) + return _legacy_C_ops.greater_than(x, y) else: check_variable_and_dtype( x, "x", ["bool", "float32", "float64", "int32", "int64"], @@ -610,10 +616,10 @@ def less_equal(x, y, name=None): """ if in_dygraph_mode(): axis = -1 - return _C_ops.final_state_less_equal(x, y, axis) + return _C_ops.less_equal(x, y, axis) else: if _in_legacy_dygraph(): - return _C_ops.less_equal(x, y) + return _legacy_C_ops.less_equal(x, y) else: check_variable_and_dtype( x, "x", ["bool", "float32", "float64", "int32", "int64"], @@ 
-663,10 +669,10 @@ def less_than(x, y, name=None): """ if in_dygraph_mode(): default_axis = -1 - return _C_ops.final_state_less_than(x, y, default_axis) + return _C_ops.less_than(x, y, default_axis) else: if _in_legacy_dygraph(): - return _C_ops.less_than(x, y) + return _legacy_C_ops.less_than(x, y) else: check_variable_and_dtype( x, "x", ["bool", "float32", "float64", "int32", "int64"], @@ -716,10 +722,10 @@ def not_equal(x, y, name=None): """ if in_dygraph_mode(): axis = -1 - return _C_ops.final_state_not_equal(x, y, axis) + return _C_ops.not_equal(x, y, axis) else: if _in_legacy_dygraph(): - return _C_ops.not_equal(x, y) + return _legacy_C_ops.not_equal(x, y) else: check_variable_and_dtype( x, "x", ["bool", "float32", "float64", "int32", "int64"], @@ -769,12 +775,18 @@ def is_tensor(x): def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True): - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): op = getattr(_C_ops, op_name) if binary_op: return op(x, y) else: return op(x) + elif _in_legacy_dygraph(): + op = getattr(_legacy_C_ops, op_name) + if binary_op: + return op(x, y) + else: + return op(x) check_variable_and_dtype( x, "x", ["bool", "uint8", "int8", "int16", "int32", "int64"], op_name) @@ -828,7 +840,7 @@ def bitwise_and(x, y, out=None, name=None): print(res) # [0, 2, 1] """ if in_dygraph_mode() and out is None: - return _C_ops.final_state_bitwise_and(x, y) + return _C_ops.bitwise_and(x, y) return _bitwise_op(op_name="bitwise_and", x=x, y=y, @@ -860,7 +872,7 @@ def bitwise_or(x, y, out=None, name=None): print(res) # [-1, -1, -3] """ if in_dygraph_mode() and out is None: - return _C_ops.final_state_bitwise_or(x, y) + return _C_ops.bitwise_or(x, y) return _bitwise_op(op_name="bitwise_or", x=x, @@ -893,7 +905,7 @@ def bitwise_xor(x, y, out=None, name=None): print(res) # [-1, -3, -4] """ if in_dygraph_mode() and out is None: - return _C_ops.final_state_bitwise_xor(x, y) + return _C_ops.bitwise_xor(x, y) return _bitwise_op(op_name="bitwise_xor", x=x, y=y, @@ -923,7 +935,7 @@ def bitwise_not(x, out=None, name=None): print(res) # [4, 0, -2] """ if in_dygraph_mode() and out is None: - return _C_ops.final_state_bitwise_not(x) + return _C_ops.bitwise_not(x) return _bitwise_op(op_name="bitwise_not", x=x, @@ -990,11 +1002,10 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): # C++ backend will cast it into float32 if passing float from python. 
as_tensor = lambda x: paddle.to_tensor( [x], dtype='float64', place='cpu') - return _C_ops.final_state_isclose(x, y, as_tensor(rtol), - as_tensor(atol), equal_nan) + return _C_ops.isclose(x, y, as_tensor(rtol), as_tensor(atol), equal_nan) if _in_legacy_dygraph(): - return _C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', str(atol), - 'equal_nan', equal_nan) + return _legacy_C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', str(atol), + 'equal_nan', equal_nan) check_variable_and_dtype(x, "input", ['float32', 'float64'], 'isclose') check_variable_and_dtype(y, "input", ['float32', 'float64'], 'isclose') diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 238fdef009db3..7b90eaa9206f6 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -27,7 +27,7 @@ from ..fluid.layers.nn import _elementwise_op_in_dygraph from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from ..common_ops_import import dygraph_utils, fill_constant, _varbase_creator import warnings from .creation import zeros @@ -64,12 +64,12 @@ def cast(x, dtype): if in_dygraph_mode(): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - return _C_ops.final_state_cast(x, dtype) + return _C_ops.cast(x, dtype) if _non_static_mode(): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - out = _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + out = _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) return out check_variable_and_dtype(x, 'x', [ @@ -212,8 +212,7 @@ def slice(input, axes, starts, ends): ends = [ele for ele in tensor_t] infer_flags = list(-1 for i in range(len(axes))) - return _C_ops.final_state_slice(input, axes, starts, ends, infer_flags, - []) + return _C_ops.slice(input, axes, starts, ends, infer_flags, []) else: if _in_legacy_dygraph(): attrs = () @@ -264,9 +263,9 @@ def slice(input, axes, starts, ends): ends_tensor.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) - return _C_ops.slice(input, starts_tensor, ends_tensor, None, None, - 'axes', axes, 'infer_flags', infer_flags, - *attrs) + return _legacy_C_ops.slice(input, starts_tensor, ends_tensor, None, + None, 'axes', axes, 'infer_flags', + infer_flags, *attrs) if not isinstance(starts, (list, tuple, Variable)): raise ValueError( @@ -380,10 +379,10 @@ def transpose(x, perm, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_transpose(x, perm) + return _C_ops.transpose(x, perm) else: if _in_legacy_dygraph(): - out, _ = _C_ops.transpose2(x, 'axis', perm) + out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out check_variable_and_dtype(x, 'x', [ @@ -458,14 +457,14 @@ def unstack(x, axis=0, num=None): num = x.shape[axis] if num == 0: return [] - return _C_ops.final_state_unstack(x, axis, num) + return _C_ops.unstack(x, axis, num) if _non_static_mode(): if num == None: num = x.shape[axis] if num == 0: return [] - return _C_ops.unstack(x, num, 'axis', int(axis), 'num', num) + return _legacy_C_ops.unstack(x, num, 'axis', int(axis), 'num', num) helper = LayerHelper('unstack', **locals()) if num is None: @@ -535,8 +534,8 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): # [[-1], [1]] """ if in_dygraph_mode(): - return _C_ops.final_state_shard_index(input, index_num, nshards, - shard_id, ignore_value) + return _C_ops.shard_index(input, 
index_num, nshards, shard_id, + ignore_value) check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') op_type = 'shard_index' @@ -657,7 +656,7 @@ def crop(x, shape=None, offsets=None, name=None): offsets = [0] * len(x.shape) if in_dygraph_mode(): - return _C_ops.final_state_crop_tensor(x, shape, offsets) + return _C_ops.crop_tensor(x, shape, offsets) out = helper.create_variable_for_type_inference(x.dtype) ipts = {'X': x} @@ -778,10 +777,10 @@ def fill_(x, value): "The type of 'value' must be int or float, but received %s." % (type(value))) if in_dygraph_mode(): - return _C_ops.final_state_fill_(x, value) + return _C_ops.fill_(x, value) else: - return _C_ops.fill_any_(x, "value_float", float(value), "value_int", - int(value)) + return _legacy_C_ops.fill_any_(x, "value_float", float(value), + "value_int", int(value)) @dygraph_only @@ -810,9 +809,10 @@ def zero_(x): """ if in_dygraph_mode(): - return _C_ops.final_state_fill_(x, 0.) + return _C_ops.fill_(x, 0.) else: - return _C_ops.fill_any_(x, "value_float", 0., "value_int", int(0)) + return _legacy_C_ops.fill_any_(x, "value_float", 0., "value_int", + int(0)) @dygraph_only @@ -859,14 +859,14 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None): ) if in_dygraph_mode(): if len(inshape) == 2: - return _C_ops.final_state_fill_diagonal_(x, value, offset, wrap) - return _C_ops.final_state_fill_diagonal_(x, value, offset, True) + return _C_ops.fill_diagonal_(x, value, offset, wrap) + return _C_ops.fill_diagonal_(x, value, offset, True) if len(inshape) == 2: - return _C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, - 'wrap', wrap) - return _C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, 'wrap', - True) + return _legacy_C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, + 'wrap', wrap) + return _legacy_C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, + 'wrap', True) def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): @@ -894,16 +894,16 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): if inplace: if in_dygraph_mode(): - return _C_ops.final_state_fill_diagonal_tensor_( - x, y, offset, dim1, dim2) + return _C_ops.fill_diagonal_tensor_(x, y, offset, dim1, dim2) else: - return _C_ops.fill_diagonal_tensor_(x, y, 'offset', offset, 'dim1', - dim1, 'dim2', dim2) + return _legacy_C_ops.fill_diagonal_tensor_(x, y, 'offset', offset, + 'dim1', dim1, 'dim2', + dim2) if in_dygraph_mode(): - return _C_ops.final_state_fill_diagonal_tensor(x, y, offset, dim1, dim2) + return _C_ops.fill_diagonal_tensor(x, y, offset, dim1, dim2) else: - return _C_ops.fill_diagonal_tensor(x, y, 'offset', offset, 'dim1', dim1, - 'dim2', dim2) + return _legacy_C_ops.fill_diagonal_tensor(x, y, 'offset', offset, + 'dim1', dim1, 'dim2', dim2) def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None): @@ -1058,7 +1058,7 @@ def concat(x, axis=0, name=None): axis = axis.item(0) if not isinstance(input, Variable): input = [t for t in input if t.shape.count(0) == 0] - return _C_ops.final_state_concat(input, axis) + return _C_ops.concat(input, axis) if _in_legacy_dygraph(): if isinstance(axis, Variable): @@ -1067,7 +1067,7 @@ def concat(x, axis=0, name=None): if not isinstance(input, Variable): input = [t for t in input if t.shape.count(0) == 0] out = _varbase_creator() - _C_ops.concat(input, out, 'axis', axis) + _legacy_C_ops.concat(input, out, 'axis', axis) return out check_type(input, 'input', (list, tuple, Variable), 'concat') @@ -1157,9 +1157,9 @@ def 
broadcast_tensors(input, name=None): num_inputs = len(input) if paddle.framework.in_dygraph_mode(): - return _C_ops.final_state_broadcast_tensors(input) + return _C_ops.broadcast_tensors(input) if paddle.framework._non_static_mode(): - return _C_ops.broadcast_tensors(input, num_inputs) + return _legacy_C_ops.broadcast_tensors(input, num_inputs) check_type(input, 'input', (list, tuple), 'broadcast_tensors') if num_inputs < 1: @@ -1257,10 +1257,10 @@ def flip(x, axis, name=None): axis = [axis] if in_dygraph_mode(): - return _C_ops.final_state_flip(x, axis) + return _C_ops.flip(x, axis) if paddle.in_dynamic_mode(): - return _C_ops.flip(x, "axis", axis) + return _legacy_C_ops.flip(x, "axis", axis) helper = LayerHelper("flip", **locals()) check_type(x, 'X', (Variable), 'flip') @@ -1476,11 +1476,11 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): raise ValueError("The stop_axis should be larger than stat_axis") if in_dygraph_mode(): - return _C_ops.final_state_flatten(x, start_axis, stop_axis) + return _C_ops.flatten(x, start_axis, stop_axis) if _in_legacy_dygraph(): - dy_out, _ = _C_ops.flatten_contiguous_range(x, 'start_axis', start_axis, - 'stop_axis', stop_axis) + dy_out, _ = _legacy_C_ops.flatten_contiguous_range( + x, 'start_axis', start_axis, 'stop_axis', stop_axis) return dy_out helper = LayerHelper('flatten', **locals()) @@ -1525,12 +1525,11 @@ def flatten_(x, start_axis=0, stop_axis=-1, name=None): raise ValueError("The stop_axis should be larger than stat_axis") if in_dygraph_mode(): - return _C_ops.final_state_flatten_(x, start_axis, stop_axis) + return _C_ops.flatten_(x, start_axis, stop_axis) if _in_legacy_dygraph(): - dy_out, _ = _C_ops.flatten_contiguous_range_(x, 'start_axis', - start_axis, 'stop_axis', - stop_axis) + dy_out, _ = _legacy_C_ops.flatten_contiguous_range_( + x, 'start_axis', start_axis, 'stop_axis', stop_axis) return dy_out @@ -1594,10 +1593,10 @@ def roll(x, shifts, axis=None, name=None): axis = [] if in_dygraph_mode(): - return _C_ops.final_state_roll(x, shifts, axis) + return _C_ops.roll(x, shifts, axis) if _in_legacy_dygraph(): - return _C_ops.roll(x, 'axis', axis, 'shifts', shifts) + return _legacy_C_ops.roll(x, 'axis', axis, 'shifts', shifts) helper = LayerHelper("roll", **locals()) check_type(axis, 'axis', (list, tuple), 'roll') @@ -1713,10 +1712,10 @@ def stack(x, axis=0, name=None): axis = 0 if axis is None else axis if in_dygraph_mode(): - return _C_ops.final_state_stack(x, axis) + return _C_ops.stack(x, axis) if _in_legacy_dygraph(): - return _C_ops.stack(x, 'axis', axis) + return _legacy_C_ops.stack(x, 'axis', axis) if not isinstance(x, list) and not isinstance(x, tuple): # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. @@ -1840,12 +1839,11 @@ def split(x, num_or_sections, axis=0, name=None): "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " "received %s." 
% (type(num_or_sections))) if in_dygraph_mode(): - return _C_ops.final_state_split( - input, [num_or_sections] - if isinstance(num_or_sections, int) else num_or_sections, dim) + return _C_ops.split(input, [num_or_sections] if isinstance( + num_or_sections, int) else num_or_sections, dim) elif _in_legacy_dygraph(): out = [_varbase_creator() for n in range(num)] - _C_ops.split(input, out, *attrs) + _legacy_C_ops.split(input, out, *attrs) return out check_variable_and_dtype(input, 'input', [ @@ -2009,9 +2007,9 @@ def squeeze(x, axis=None, name=None): input = x axes = axis if in_dygraph_mode(): - return _C_ops.final_state_squeeze(input, axes) + return _C_ops.squeeze(input, axes) if _in_legacy_dygraph(): - out, _ = _C_ops.squeeze2(input, 'axes', axes) + out, _ = _legacy_C_ops.squeeze2(input, 'axes', axes) return out helper = LayerHelper("squeeze", **locals()) @@ -2060,9 +2058,9 @@ def squeeze_(x, axis=None, name=None): input = x axes = axis if in_dygraph_mode(): - return _C_ops.final_state_squeeze_(input, axes) + return _C_ops.squeeze_(input, axes) if _in_legacy_dygraph(): - out, _ = _C_ops.squeeze2_(input, 'axes', axes) + out, _ = _legacy_C_ops.squeeze2_(input, 'axes', axes) return out @@ -2124,7 +2122,7 @@ def unique_consecutive(x, axis = [axis] attr_dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - out, inverse, counts = _C_ops.final_state_unique_consecutive( + out, inverse, counts = _C_ops.unique_consecutive( x, return_inverse, return_counts, axis, attr_dtype) outs = [out] if return_inverse: @@ -2135,7 +2133,7 @@ def unique_consecutive(x, return outs[0] return tuple(outs) elif paddle.in_dynamic_mode(): - out, inverse, counts = _C_ops.unique_consecutive( + out, inverse, counts = _legacy_C_ops.unique_consecutive( x, 'dtype', attr_dtype, 'return_inverse', return_inverse, 'return_counts', return_counts, 'axis', axis) outs = [out] @@ -2240,11 +2238,11 @@ def unique(x, attr_dtype = convert_np_dtype_to_dtype_(dtype) if _non_static_mode(): if in_dygraph_mode(): - out, indices, inverse, counts = _C_ops.final_state_unique( + out, indices, inverse, counts = _C_ops.unique( x, return_index, return_inverse, return_counts, axis, attr_dtype) if _in_legacy_dygraph(): - out, inverse, indices, counts = _C_ops.unique( + out, inverse, indices, counts = _legacy_C_ops.unique( x, 'dtype', attr_dtype, 'return_index', return_index, 'return_inverse', return_inverse, 'return_counts', return_counts, 'axis', axis, "is_sorted", True) @@ -2371,9 +2369,9 @@ def unsqueeze(x, axis, name=None): for item in axes ] if _in_legacy_dygraph(): - out, _ = _C_ops.unsqueeze2(input, 'axes', axes) + out, _ = _legacy_C_ops.unsqueeze2(input, 'axes', axes) return out - return _C_ops.final_state_unsqueeze(input, axes) + return _C_ops.unsqueeze(input, axes) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_variable_and_dtype(input, 'input', [ @@ -2434,8 +2432,8 @@ def unsqueeze_(x, axis, name=None): for item in axes ] if in_dygraph_mode(): - return _C_ops.final_state_unsqueeze_(input, axes) - out, _ = _C_ops.unsqueeze2_(input, 'axes', axes) + return _C_ops.unsqueeze_(input, axes) + out, _ = _legacy_C_ops.unsqueeze2_(input, 'axes', axes) return out @@ -2488,10 +2486,11 @@ def gather(x, index, axis=None, name=None): axis = 0 if in_dygraph_mode(): - return _C_ops.final_state_gather(x, index, axis) + return _C_ops.gather(x, index, axis) if _in_legacy_dygraph(): axis = axis.item() if isinstance(axis, paddle.Tensor) else axis - return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False) + 
return _legacy_C_ops.gather(x, index, None, "axis", axis, "overwrite", + False) check_variable_and_dtype( x, 'x', @@ -2561,7 +2560,7 @@ def unbind(input, axis=0): # x3.shape [3, 5] """ if in_dygraph_mode(): - return _C_ops.final_state_unbind(input, axis) + return _C_ops.unbind(input, axis) if not isinstance(axis, (int)): raise TypeError("The type of 'axis' must be int, but received %s." % @@ -2572,7 +2571,7 @@ def unbind(input, axis=0): axis_ = axis if axis >= 0 else len(input_shape) + axis num = input_shape[axis_] if _in_legacy_dygraph(): - return _C_ops.unbind(input, num, 'axis', axis) + return _legacy_C_ops.unbind(input, num, 'axis', axis) helper = LayerHelper("unbind", **locals()) check_type(input, 'input', (Variable), 'unbind') @@ -2665,10 +2664,11 @@ def scatter(x, index, updates, overwrite=True, name=None): # [1., 1.]] """ if in_dygraph_mode(): - return _C_ops.final_state_scatter(x, index, updates, overwrite) + return _C_ops.scatter(x, index, updates, overwrite) else: if _in_legacy_dygraph(): - return _C_ops.scatter(x, index, updates, 'overwrite', overwrite) + return _legacy_C_ops.scatter(x, index, updates, 'overwrite', + overwrite) else: check_variable_and_dtype( x, 'dtype', ['float32', 'float64', 'float16', 'int32', 'int64'], @@ -2694,8 +2694,8 @@ def scatter_(x, index, updates, overwrite=True, name=None): Please refer to :ref:`api_paddle_tensor_scatter`. """ if in_dygraph_mode(): - return _C_ops.final_state_scatter_(x, index, updates, overwrite) - return _C_ops.scatter_(x, index, updates, 'overwrite', overwrite) + return _C_ops.scatter_(x, index, updates, overwrite) + return _legacy_C_ops.scatter_(x, index, updates, 'overwrite', overwrite) def scatter_nd_add(x, index, updates, name=None): @@ -2768,10 +2768,10 @@ def scatter_nd_add(x, index, updates, name=None): # [3, 5, 9, 10] """ if in_dygraph_mode(): - return _C_ops.final_state_scatter_nd_add(x, index, updates) + return _C_ops.scatter_nd_add(x, index, updates) else: if _in_legacy_dygraph(): - op = getattr(_C_ops, 'scatter_nd_add') + op = getattr(_legacy_C_ops, 'scatter_nd_add') return op(x, index, updates) else: if x.dtype != updates.dtype: @@ -2918,10 +2918,10 @@ def tile(x, repeat_times, name=None): assert repeat_times.ndim == 1, "Only support ndim == 1 while repeat_times is a Tensor." 
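# (Illustrative aside, not part of the patch. When `repeat_times` arrives as a
# Tensor, the eager branch below first normalizes it to a Python list and then
# calls the renamed binding positionally; the legacy branch passes it as a
# named attribute instead. A minimal public-API call that reaches this code,
# assuming a working dygraph install:)
import paddle

t = paddle.to_tensor([[1, 2], [3, 4]])
tiled = paddle.tile(t, repeat_times=[2, 1])  # rows repeated twice
assert tiled.shape == [4, 2]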
repeat_times = repeat_times.numpy().tolist() - return _C_ops.final_state_tile(x, repeat_times) + return _C_ops.tile(x, repeat_times) if _in_legacy_dygraph(): - return _C_ops.tile(x, 'repeat_times', repeat_times) + return _legacy_C_ops.tile(x, 'repeat_times', repeat_times) check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') if isinstance(repeat_times, Variable): @@ -3008,10 +3008,10 @@ def expand_as(x, y, name=None): # [[1, 2, 3], [1, 2, 3]] """ if in_dygraph_mode(): - return _C_ops.final_state_expand_as(x, None, y.shape) + return _C_ops.expand_as(x, None, y.shape) if _non_static_mode(): - return _C_ops.expand_as_v2(x, 'target_shape', y.shape) + return _legacy_C_ops.expand_as_v2(x, 'target_shape', y.shape) check_variable_and_dtype(x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], @@ -3064,9 +3064,9 @@ def broadcast_to(x, shape, name=None): # [[1, 2, 3], [1, 2, 3]] """ if in_dygraph_mode(): - return _C_ops.final_state_expand(x, shape) + return _C_ops.expand(x, shape) if _in_legacy_dygraph(): - return _C_ops.expand_v2(x, 'shape', shape) + return _legacy_C_ops.expand_v2(x, 'shape', shape) if isinstance(shape, Variable): assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.') @@ -3155,10 +3155,10 @@ def expand(x, shape, name=None): # [[1, 2, 3], [1, 2, 3]] """ if in_dygraph_mode(): - return _C_ops.final_state_expand(x, shape) + return _C_ops.expand(x, shape) if paddle.in_dynamic_mode(): - return _C_ops.expand_v2(x, 'shape', shape) + return _legacy_C_ops.expand_v2(x, 'shape', shape) if isinstance(shape, Variable): assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.') @@ -3291,10 +3291,10 @@ def reshape(x, shape, name=None): item.numpy().item(0) if isinstance(item, tmp_tensor_type) else item for item in shape ] - out = _C_ops.final_state_reshape(x, shape) + out = _C_ops.reshape(x, shape) elif isinstance(shape, tmp_tensor_type): shape.stop_gradient = True - out = _C_ops.final_state_reshape(x, shape) + out = _C_ops.reshape(x, shape) else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," @@ -3313,10 +3313,10 @@ def reshape(x, shape, name=None): item.numpy().item(0) if isinstance(item, Variable) else item for item in shape ] - out, _ = _C_ops.reshape2(x, None, 'shape', shape) + out, _ = _legacy_C_ops.reshape2(x, None, 'shape', shape) elif isinstance(shape, tmp_tensor_type): shape.stop_gradient = True - out, _ = _C_ops.reshape2(x, shape) + out, _ = _legacy_C_ops.reshape2(x, shape) else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," @@ -3409,10 +3409,10 @@ def reshape_(x, shape, name=None): item.numpy().item(0) if isinstance(item, tmp_tensor_type) else item for item in shape ] - out = _C_ops.final_state_reshape_(x, shape) + out = _C_ops.reshape_(x, shape) elif isinstance(shape, tmp_tensor_type): shape.stop_gradient = True - out = _C_ops.final_state_reshape_(x, shape) + out = _C_ops.reshape_(x, shape) else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," @@ -3425,7 +3425,7 @@ def reshape_(x, shape, name=None): item.numpy().item(0) if isinstance(item, Variable) else item for item in shape ] - out, _ = _C_ops.reshape2_(x, None, 'shape', shape) + out, _ = _legacy_C_ops.reshape2_(x, None, 'shape', shape) return out elif isinstance(shape, Variable): shape.stop_gradient = True @@ -3435,7 +3435,7 @@ def reshape_(x, shape, name=None): # Thus, convert Shape Tensor to list firstly and then call # reshape inplace op. 
shape_list = shape.numpy().tolist() - out, _ = _C_ops.reshape2_(x, None, 'shape', shape_list) + out, _ = _legacy_C_ops.reshape2_(x, None, 'shape', shape_list) return out @@ -3512,10 +3512,10 @@ def gather_nd(x, index, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_gather_nd(x, index) + return _C_ops.gather_nd(x, index) else: if _in_legacy_dygraph(): - return _C_ops.gather_nd(x, index) + return _legacy_C_ops.gather_nd(x, index) check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], 'gather_np') @@ -3615,7 +3615,7 @@ def strided_slice(x, axes, starts, ends, strides, name=None): # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2]. """ if in_dygraph_mode(): - return _C_ops.final_state_strided_slice(x, axes, starts, ends, strides) + return _C_ops.strided_slice(x, axes, starts, ends, strides) helper = LayerHelper('strided_slice', **locals()) @@ -3967,9 +3967,9 @@ def as_complex(x, name=None): # [ 6. +7.j 8. +9.j 10.+11.j]] """ if in_dygraph_mode(): - return _C_ops.final_state_as_complex(x) - if _in_legacy_dygraph(): return _C_ops.as_complex(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.as_complex(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'as_complex') op_type = "as_complex" @@ -4018,9 +4018,9 @@ def as_real(x, name=None): # [10. 11.]]] """ if in_dygraph_mode(): - return _C_ops.final_state_as_real(x) - if _in_legacy_dygraph(): return _C_ops.as_real(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.as_real(x) check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'as_real') op_type = "as_real" @@ -4075,9 +4075,8 @@ def repeat_interleave(x, repeats, axis=None, name=None): if in_dygraph_mode(): if isinstance(repeats, Variable): - return _C_ops.final_state_repeat_interleave_with_tensor_index( - x, repeats, axis) - return _C_ops.final_state_repeat_interleave(x, repeats, axis) + return _C_ops.repeat_interleave_with_tensor_index(x, repeats, axis) + return _C_ops.repeat_interleave(x, repeats, axis) helper = LayerHelper("repeat_interleave", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], @@ -4181,11 +4180,11 @@ def moveaxis(x, source, destination, name=None): perm[dst_dims[i]] = src_dims[i] if in_dygraph_mode(): - out = _C_ops.final_state_transpose(x, perm) + out = _C_ops.transpose(x, perm) return out if _in_legacy_dygraph(): - out, _ = _C_ops.transpose2(x, 'axis', perm) + out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out check_variable_and_dtype(x, 'x', [ @@ -4271,8 +4270,8 @@ def take_along_axis(arr, indices, axis): broadcast_shape = tuple(broadcast_shape_list) arr = paddle.broadcast_to(arr, broadcast_shape) if not _in_legacy_dygraph(): - return _C_ops.final_state_take_along_axis(arr, indices, axis) - return _C_ops.take_along_axis(arr, indices, 'Axis', axis) + return _C_ops.take_along_axis(arr, indices, axis) + return _legacy_C_ops.take_along_axis(arr, indices, 'Axis', axis) check_variable_and_dtype( arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], 'take_along_axis') @@ -4336,10 +4335,9 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): indices = paddle.broadcast_to(indices, broadcast_shape) values = paddle.broadcast_to(values, indices.shape) if in_dygraph_mode(): - return _C_ops.final_state_put_along_axis(arr, indices, values, axis, - reduce) - return _C_ops.put_along_axis(arr, indices, values, "Axis", axis, - "Reduce", reduce) + return _C_ops.put_along_axis(arr, indices, values, axis, reduce) + return 
_legacy_C_ops.put_along_axis(arr, indices, values, "Axis", axis, + "Reduce", reduce) check_variable_and_dtype( arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], @@ -4383,10 +4381,9 @@ def put_along_axis_(arr, indices, values, axis, reduce='assign'): indices = paddle.broadcast_to(indices, broadcast_shape) values = paddle.broadcast_to(values, indices.shape) if in_dygraph_mode(): - return _C_ops.final_state_put_along_axis_(arr, indices, values, axis, - reduce) - return _C_ops.put_along_axis_(arr, indices, values, "Axis", axis, "Reduce", - reduce) + return _C_ops.put_along_axis_(arr, indices, values, axis, reduce) + return _legacy_C_ops.put_along_axis_(arr, indices, values, "Axis", axis, + "Reduce", reduce) # TODO(dev): We need avoid implementing it by this way. diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 9dd1ab36fd0f3..aaa96e2afaeb7 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -69,7 +69,7 @@ from ..fluid.layers import elementwise_sub -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -115,9 +115,9 @@ def log(x, name=None): # [[0.693147, 1.09861, 1.38629], [1.94591, 2.07944, 2.19722]] """ if in_dygraph_mode(): - return _C_ops.final_state_log(x) - if _in_legacy_dygraph(): return _C_ops.log(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.log(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], "log") inputs = {'X': [x]} @@ -176,10 +176,10 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_scale(x, scale, float(bias), bias_after_scale) + return _C_ops.scale(x, scale, float(bias), bias_after_scale) if _non_static_mode(): _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale - out = _C_ops.scale(x, 'scale', + out = _legacy_C_ops.scale(x, 'scale', float(_scale), 'bias', float(bias), 'bias_after_scale', bias_after_scale) return dygraph_utils._append_activation_in_dygraph(out) @@ -234,7 +234,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): """ if _non_static_mode(): - return _C_ops.stanh(x, 'scale_a', scale_a, 'scale_b', scale_b) + return _legacy_C_ops.stanh(x, 'scale_a', scale_a, 'scale_b', scale_b) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'stanh') @@ -299,7 +299,7 @@ def multiplex(inputs, index, name=None): """ if _non_static_mode(): - return _C_ops.multiplex(index, inputs) + return _legacy_C_ops.multiplex(index, inputs) helper = LayerHelper('multiplex', **locals()) check_type(inputs, 'inputs', (list), 'multiplex') @@ -327,10 +327,10 @@ def scale_(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Please refer to :ref:`api_tensor_scale`. 
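`scale` is a good side-by-side of the two calling conventions: the final-state kernel takes `(x, scale, bias, bias_after_scale)` positionally, while the legacy kernel interleaves attribute names with values. The public API is agnostic to which branch fires; a quick check, assuming this patch is applied:

import paddle

x = paddle.to_tensor([1.0, 2.0])
# Eager branch: _C_ops.scale(x, 2.0, 0.5, True); legacy branch:
# _legacy_C_ops.scale(x, 'scale', 2.0, 'bias', 0.5, 'bias_after_scale', True).
y = paddle.scale(x, scale=2.0, bias=0.5, bias_after_scale=True)
print(y)  # Tensor([2.5, 4.5]), i.e. 2*x + 0.5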
""" if in_dygraph_mode(): - return _C_ops.final_state_scale_(x, scale, float(bias), bias_after_scale) + return _C_ops.scale_(x, scale, float(bias), bias_after_scale) if _in_legacy_dygraph(): _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale - return _C_ops.scale_(x, 'scale', + return _legacy_C_ops.scale_(x, 'scale', float(_scale), 'bias', float(bias), 'bias_after_scale', bias_after_scale) @@ -383,14 +383,14 @@ def pow(x, y, name=None): # in dynamic graph mode if in_dygraph_mode(): if isinstance(y, (int, float)): - return _C_ops.final_state_pow(x, y) + return _C_ops.pow(x, y) elif isinstance(y, (paddle.Tensor, Variable)): - return _C_ops.final_state_elementwise_pow(x, y) + return _C_ops.elementwise_pow(x, y) else: raise TypeError('y must be scalar or tensor type, but received: %s '% (y.dtype)) if _in_legacy_dygraph(): if isinstance(y, (int, float)): - return _C_ops.pow(x, 'factor', y) + return _legacy_C_ops.pow(x, 'factor', y) elif isinstance(y, (paddle.Tensor, Variable)): return _elementwise_op_in_dygraph( x, y, axis=-1, act=None, op_name='elementwise_pow') @@ -415,16 +415,16 @@ def pow(x, y, name=None): OP_NAMEMAPPING = { - 'elementwise_max': 'final_state_maximum', - 'elementwise_min': 'final_state_minimum', - 'elementwise_pow': 'final_state_elementwise_pow', - 'elementwise_floordiv': 'final_state_floor_divide', - 'elementwise_mod': 'final_state_modulo', - 'elementwise_add': 'final_state_add', - 'elementwise_sub': 'final_state_subtract', - 'elementwise_mul': 'final_state_multiply', - 'elementwise_div': 'final_state_divide', - 'elementwise_mod': 'final_state_modulo', + 'elementwise_max': 'maximum', + 'elementwise_min': 'minimum', + 'elementwise_pow': 'elementwise_pow', + 'elementwise_floordiv': 'floor_divide', + 'elementwise_mod': 'modulo', + 'elementwise_add': 'add', + 'elementwise_sub': 'subtract', + 'elementwise_mul': 'multiply', + 'elementwise_div': 'divide', + 'elementwise_mod': 'modulo', } @dygraph_only @@ -438,7 +438,7 @@ def is_inplace(op_name): return op_name[-1] == "_" if op_name not in OP_NAMEMAPPING.keys() or axis != -1: - op = getattr(_C_ops, op_name) + op = getattr(_legacy_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) else: if in_dygraph_mode(): @@ -446,7 +446,7 @@ def is_inplace(op_name): out = op(x, y) if _in_legacy_dygraph(): - op = getattr(_C_ops, op_name) + op = getattr(_legacy_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) return dygraph_utils._append_activation_in_dygraph( @@ -545,10 +545,10 @@ def add(x, y, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_add( x, y) + return _C_ops.add( x, y) else: if _in_legacy_dygraph(): - return _C_ops.elementwise_add(x, y) + return _legacy_C_ops.elementwise_add(x, y) else: return _elementwise_op(LayerHelper('elementwise_add', **locals())) @@ -567,7 +567,7 @@ def add_(x, y, name=None): raise ValueError("The shape of broadcast output {} is different from that of inplace tensor {} in the Inplace operation.".format(out_shape, x.shape)) if in_dygraph_mode(): - return _C_ops.final_state_add_(x, y) + return _C_ops.add_(x, y) else: out = _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) @@ -632,7 +632,7 @@ def subtract(x, y, name=None): axis = -1 act = None if in_dygraph_mode(): - return _C_ops.final_state_subtract(x, y) + return _C_ops.subtract(x, y) else: if _in_legacy_dygraph(): return _elementwise_op_in_dygraph( @@ -655,7 +655,7 @@ def subtract_(x, y, name=None): raise ValueError("The shape of broadcast output {} is different from that 
of inplace tensor {} in the Inplace operation.".format(out_shape, x.shape)) if in_dygraph_mode(): - return _C_ops.final_state_subtract_(x, y) + return _C_ops.subtract_(x, y) else: out = _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name='elementwise_sub_') @@ -696,7 +696,7 @@ def divide(x, y, name=None): axis = -1 act = None if in_dygraph_mode(): - return _C_ops.final_state_divide( x, y) + return _C_ops.divide( x, y) else: if _in_legacy_dygraph(): return _elementwise_op_in_dygraph( @@ -737,9 +737,7 @@ def floor_divide(x, y, name=None): """ op_type = 'elementwise_floordiv' axis = -1 - if in_dygraph_mode(): - return _C_ops.final_state_floor_divide(x, y) - if _in_legacy_dygraph(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) @@ -779,9 +777,7 @@ def remainder(x, y, name=None): """ op_type = 'elementwise_mod' axis = -1 - if in_dygraph_mode(): - return _C_ops.final_state_modulo(x, y) - if _in_legacy_dygraph(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) @@ -832,7 +828,7 @@ def multiply(x, y, name=None): axis = -1 if in_dygraph_mode(): - return _C_ops.final_state_multiply(x, y) + return _C_ops.multiply(x, y) else: if _in_legacy_dygraph(): return _elementwise_op_in_dygraph( @@ -899,9 +895,7 @@ def maximum(x, y, name=None): op_type = 'elementwise_max' axis = -1 act = None - if in_dygraph_mode(): - return _C_ops.final_state_maximum(x, y) - if _in_legacy_dygraph(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -960,9 +954,7 @@ def minimum(x, y, name=None): op_type = 'elementwise_min' axis = -1 act = None - if in_dygraph_mode(): - return _C_ops.final_state_minimum(x, y) - if _in_legacy_dygraph(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -1024,7 +1016,7 @@ def fmax(x, y, name=None): axis = -1 act = None if in_dygraph_mode(): - return _C_ops.final_state_fmax(x, y, axis) + return _C_ops.fmax(x, y, axis) if _in_legacy_dygraph(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) @@ -1087,7 +1079,7 @@ def fmin(x, y, name=None): axis = -1 act = None if in_dygraph_mode(): - return _C_ops.final_state_fmin(x, y, axis) + return _C_ops.fmin(x, y, axis) if _in_legacy_dygraph(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) @@ -1165,7 +1157,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - return _C_ops.final_state_sum(x, axis, dtype, keepdim) + return _C_ops.sum(x, axis, dtype, keepdim) if len(axis) == 0: reduce_all_flag = True @@ -1178,11 +1170,11 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): if _in_legacy_dygraph(): axis = axis if axis != None and axis != [] else [0] if dtype_flag: - return _C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, + return _legacy_C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag, 'in_dtype', x.dtype, 'out_dtype', dtype) else: - return _C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, + return _legacy_C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag) attrs = { @@ -1473,12 +1465,12 @@ def add_n(inputs, name=None): inputs = [inputs] for x in inputs: if not x.is_dense(): - return 
_C_ops.sum(inputs, 'use_mkldnn', False) - return _C_ops.final_state_add_n(inputs) + return _legacy_C_ops.sum(inputs, 'use_mkldnn', False) + return _C_ops.add_n(inputs) if _in_legacy_dygraph(): if isinstance(inputs, Variable): inputs = [inputs] - return _C_ops.sum(inputs, 'use_mkldnn', False) + return _legacy_C_ops.sum(inputs, 'use_mkldnn', False) helper = LayerHelper('add_n', **locals()) check_type(inputs, 'inputs', (Variable, tuple, list), 'add_n') @@ -1532,10 +1524,10 @@ def trunc(input, name=None): # [0., 0.]])) ''' if in_dygraph_mode(): - return _C_ops.final_state_trunc(input) + return _C_ops.trunc(input) else: if _in_legacy_dygraph(): - return _C_ops.trunc(input) + return _legacy_C_ops.trunc(input) else: inputs = {"X": input} attrs = {} @@ -1618,9 +1610,9 @@ def mm(input, mat2, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_matmul(input, mat2, False, False) + return _C_ops.matmul(input, mat2, False, False) elif paddle.in_dynamic_mode(): - return _C_ops.matmul_v2(input, mat2) + return _legacy_C_ops.matmul_v2(input, mat2) def __check_input(x, y): var_names = {'x': x, 'y': y} @@ -1729,10 +1721,10 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): if in_dygraph_mode(): - return _C_ops.final_state_addmm( input, x, y, alpha, beta) + return _C_ops.addmm( input, x, y, alpha, beta) else: if _in_legacy_dygraph(): - out = _C_ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta) + out = _legacy_C_ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta) return out else: inputs = {'Input': input, "X": x, "Y": y} @@ -1791,10 +1783,10 @@ def renorm(x, p, axis, max_norm): raise ValueError("the axis:{} should not be less than -1 * length of input_shape:{}".format(axis,-1 * len(input_shape))) axis = axis + len(input_shape) if in_dygraph_mode(): - out = _C_ops.final_state_renorm(x, p, axis, max_norm) + out = _C_ops.renorm(x, p, axis, max_norm) return out elif _in_legacy_dygraph(): - out = _C_ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm) + out = _legacy_C_ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm) return out inputs = {'X': x} @@ -1849,9 +1841,9 @@ def inner(x, y, name=None): ny = y.reshape((-1, yshape[-1])) if in_dygraph_mode(): - return _C_ops.final_state_matmul(nx, ny.T, False, False).reshape(dstshape) + return _C_ops.matmul(nx, ny.T, False, False).reshape(dstshape) elif paddle.in_dynamic_mode(): - return _C_ops.matmul_v2(nx, ny.T).reshape(dstshape) + return _legacy_C_ops.matmul_v2(nx, ny.T).reshape(dstshape) def __check_input(x, y): var_names = {'x': x, 'y': y} @@ -1913,9 +1905,9 @@ def outer(x, y, name=None): ny = y.reshape((1, -1)) if in_dygraph_mode(): - return _C_ops.final_state_matmul(nx, ny, False, False) + return _C_ops.matmul(nx, ny, False, False) elif paddle.in_dynamic_mode(): - return _C_ops.matmul_v2(nx, ny) + return _legacy_C_ops.matmul_v2(nx, ny) def __check_input(x, y): var_names = {'x': x, 'y': y} @@ -1986,9 +1978,9 @@ def logsumexp(x, axis=None, keepdim=False, name=None): if in_dygraph_mode(): if reduce_all: axis = range(len(x.shape)) - return _C_ops.final_state_logsumexp(x, axis, keepdim, reduce_all) + return _C_ops.logsumexp(x, axis, keepdim, reduce_all) if _in_legacy_dygraph(): - return _C_ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) + return _legacy_C_ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) check_variable_and_dtype(x, 'x', ['float32', 'float64'], @@ -2030,9 +2022,9 @@ def inverse(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_inverse(x) - elif 
paddle.in_dynamic_mode(): return _C_ops.inverse(x) + elif paddle.in_dynamic_mode(): + return _legacy_C_ops.inverse(x) def _check_input(x): check_variable_and_dtype(x, 'x', @@ -2165,9 +2157,9 @@ def max(x, axis=None, keepdim=False, name=None): reduce_all, axis = _get_reduce_axis(axis) if in_dygraph_mode(): - return _C_ops.final_state_max(x, axis, keepdim) + return _C_ops.max(x, axis, keepdim) if _in_legacy_dygraph(): - return _C_ops.reduce_max(x, 'dim', axis, 'keep_dim', keepdim, + return _legacy_C_ops.reduce_max(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) helper = LayerHelper('max', **locals()) @@ -2266,10 +2258,10 @@ def min(x, axis=None, keepdim=False, name=None): reduce_all, axis = _get_reduce_axis(axis) if in_dygraph_mode(): - return _C_ops.final_state_min(x, axis, keepdim) + return _C_ops.min(x, axis, keepdim) if _in_legacy_dygraph(): - return _C_ops.reduce_min(x, 'dim', axis, 'keep_dim', keepdim, + return _legacy_C_ops.reduce_min(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) helper = LayerHelper('min', **locals()) @@ -2381,9 +2373,9 @@ def amax(x, axis=None, keepdim=False, name=None): reduce_all, axis = _get_reduce_axis(axis) if in_dygraph_mode(): - return _C_ops.final_state_amax(x, axis, keepdim) + return _C_ops.amax(x, axis, keepdim) if _in_legacy_dygraph(): - return _C_ops.reduce_amax(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_amax(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) helper = LayerHelper('amax', **locals()) check_variable_and_dtype( @@ -2495,9 +2487,9 @@ def amin(x, axis=None, keepdim=False, name=None): reduce_all, axis = _get_reduce_axis( axis ) if in_dygraph_mode(): - return _C_ops.final_state_amin(x, axis, keepdim) + return _C_ops.amin(x, axis, keepdim) elif _in_legacy_dygraph(): - return _C_ops.reduce_amin(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_amin(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) helper = LayerHelper('amin', **locals()) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int32', 'int64'], 'amin') @@ -2540,9 +2532,9 @@ def log1p(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_log1p(x) - if _in_legacy_dygraph(): return _C_ops.log1p(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.log1p(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], "log1p") inputs = {'X': [x]} @@ -2591,9 +2583,9 @@ def log2(x, name=None): print(res) # [1.0] """ if in_dygraph_mode(): - return _C_ops.final_state_log2(x) - if _in_legacy_dygraph(): return _C_ops.log2(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.log2(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], "log2") inputs = {'X': [x]} @@ -2643,9 +2635,9 @@ def log10(x, name=None): print(res) # [1.0] """ if in_dygraph_mode(): - return _C_ops.final_state_log10(x) - if _in_legacy_dygraph(): return _C_ops.log10(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.log10(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], "log10") inputs = {'X': [x]} @@ -2710,7 +2702,7 @@ def clip(x, min=None, max=None, name=None): max = max.numpy().item(0) min = min_ if min is None else min max = max_ if max is None else max - return _C_ops.final_state_clip(x, min, max) + return _C_ops.clip(x, min, max) if _in_legacy_dygraph(): if isinstance(min, Variable): @@ -2719,7 +2711,7 @@ def clip(x, min=None, max=None, name=None): max = max.numpy().item(0) min = min_ if min is None else min max = max_ 
if max is None else max - return _C_ops.clip(x, "min", min, "max", max) + return _legacy_C_ops.clip(x, "min", min, "max", max) if min is not None: check_type(min, 'min', (float, int, Variable), 'clip') @@ -2774,10 +2766,10 @@ def clip_(x, min=None, max=None, name=None): max = fmax if max is None else max if in_dygraph_mode(): - return _C_ops.final_state_clip_(x, min, max) + return _C_ops.clip_(x, min, max) if _in_legacy_dygraph(): - return _C_ops.clip_(x, "min", min, "max", max) + return _legacy_C_ops.clip_(x, "min", min, "max", max) @@ -2849,10 +2841,10 @@ def __check_input(x, offset, axis1, axis2): "But received axis1 = %d, axis2 = %d\n"%(axis1, axis2) if in_dygraph_mode(): - return _C_ops.final_state_trace( x, offset, axis1, axis2 ) + return _C_ops.trace( x, offset, axis1, axis2 ) if _in_legacy_dygraph(): - return _C_ops.trace(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) + return _legacy_C_ops.trace(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) __check_input(x, offset, axis1, axis2) @@ -2934,10 +2926,10 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_diagonal(x, offset, axis1, axis2) + return _C_ops.diagonal(x, offset, axis1, axis2) else: if _in_legacy_dygraph(): - return _C_ops.diagonal(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) + return _legacy_C_ops.diagonal(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) def __check_input(x, offset, axis1, axis2): check_dtype(x.dtype, 'Input', @@ -3009,9 +3001,9 @@ def kron(x, y, name=None): # [21, 24, 27, 28, 32, 36]]) """ if _in_legacy_dygraph(): - return _C_ops.kron(x, y) + return _legacy_C_ops.kron(x, y) if in_dygraph_mode(): - return _C_ops.final_state_kron(x, y) + return _C_ops.kron(x, y) helper = LayerHelper('kron', **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'kron') check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64', 'int32', 'int64'], 'kron') @@ -3071,12 +3063,12 @@ def cumsum(x, axis=None, dtype=None, name=None): if in_dygraph_mode(): if axis is None: axis = -1 - return _C_ops.final_state_cumsum(x, axis, flatten, False, False) + return _C_ops.cumsum(x, axis, flatten, False, False) if _in_legacy_dygraph(): if axis is None: - return _C_ops.cumsum(x, 'flatten', flatten) + return _legacy_C_ops.cumsum(x, 'flatten', flatten) else: - return _C_ops.cumsum(x, 'axis', axis, 'flatten', flatten) + return _legacy_C_ops.cumsum(x, 'axis', axis, 'flatten', flatten) check_type(x, 'x', (Variable), 'cumsum') locals_var = locals().copy() @@ -3145,12 +3137,12 @@ def logcumsumexp(x, axis=None, dtype=None, name=None): if in_dygraph_mode(): if axis is None: axis = -1 - return _C_ops.final_state_logcumsumexp(x, axis, flatten, False, False) + return _C_ops.logcumsumexp(x, axis, flatten, False, False) if _in_legacy_dygraph(): if axis is None: - return _C_ops.logcumsumexp(x, 'flatten', flatten) + return _legacy_C_ops.logcumsumexp(x, 'flatten', flatten) else: - return _C_ops.logcumsumexp(x, 'axis', axis, 'flatten', flatten) + return _legacy_C_ops.logcumsumexp(x, 'axis', axis, 'flatten', flatten) check_variable_and_dtype(x, 'x', ['float32', 'float64'], "logcumsumexp") @@ -3211,9 +3203,9 @@ def cumprod(x, dim=None, dtype=None, name=None): x = cast(x, dtype) if in_dygraph_mode(): - return _C_ops.final_state_cumprod(x, dim) + return _C_ops.cumprod(x, dim) if _in_legacy_dygraph(): - return _C_ops.cumprod(x, 'dim', dim) + return _legacy_C_ops.cumprod(x, 'dim', dim) check_variable_and_dtype(x, "x", ['complex64', 
'complex128', 'float32', 'float64', 'int32', 'int64'], 'cumprod') check_type(dim, 'dim', int, 'cumprod') @@ -3245,9 +3237,9 @@ def isfinite(x, name=None): print(out) # [False True True False True False False] """ if in_dygraph_mode(): - return _C_ops.final_state_isfinite( x ) + return _C_ops.isfinite( x ) if _in_legacy_dygraph(): - return _C_ops.isfinite_v2(x) + return _legacy_C_ops.isfinite_v2(x) helper = LayerHelper("isfinite_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isfinite') out = helper.create_variable_for_type_inference('bool') @@ -3276,9 +3268,9 @@ def isinf(x, name=None): print(out) # [ True False False True False False False] """ if in_dygraph_mode(): - return _C_ops.final_state_isinf( x ) + return _C_ops.isinf( x ) if _in_legacy_dygraph(): - return _C_ops.isinf_v2(x) + return _legacy_C_ops.isinf_v2(x) helper = LayerHelper("isinf_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf') out = helper.create_variable_for_type_inference(dtype='bool') @@ -3307,10 +3299,10 @@ def isnan(x, name=None): print(out) # [False False False False False True True] """ if in_dygraph_mode(): - return _C_ops.final_state_isnan( x ) + return _C_ops.isnan( x ) if _in_legacy_dygraph(): - return _C_ops.isnan_v2(x) + return _legacy_C_ops.isnan_v2(x) helper = LayerHelper("isnan_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan') out = helper.create_variable_for_type_inference(dtype='bool') @@ -3393,9 +3385,9 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): dim = [0] if in_dygraph_mode(): - return _C_ops.final_state_reduce_prod(x, dim, keepdim, reduce_all) + return _C_ops.reduce_prod(x, dim, keepdim, reduce_all) if _in_legacy_dygraph(): - return _C_ops.reduce_prod( + return _legacy_C_ops.reduce_prod( x, 'dim', dim, 'keep_dim', keepdim, 'reduce_all', reduce_all) helper = LayerHelper('reduce_prod', **locals()) @@ -3435,10 +3427,10 @@ def sign(x, name=None): print(out) # [1.0, 0.0, -1.0, 1.0] """ if in_dygraph_mode(): - return _C_ops.final_state_sign(x) + return _C_ops.sign(x) if _in_legacy_dygraph(): - return _C_ops.sign(x) + return _legacy_C_ops.sign(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'sign') helper = LayerHelper("sign", **locals()) @@ -3475,10 +3467,10 @@ def tanh(x, name=None): # [-0.37994896 -0.19737532 0.09966799 0.29131261] """ if in_dygraph_mode(): - return _C_ops.final_state_tanh( x ) + return _C_ops.tanh( x ) if _in_legacy_dygraph(): - return _C_ops.tanh(x) + return _legacy_C_ops.tanh(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'tanh') check_type(x, 'x', (Variable), 'tanh') @@ -3494,8 +3486,8 @@ def tanh_(x, name=None): Please refer to :ref:`api_tensor_tanh`. 
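The trailing underscore keeps marking in-place kernels on both namespaces (`_C_ops.tanh_`, `_legacy_C_ops.tanh_`), which is exactly the convention the `is_inplace` helper earlier in math.py tests with `op_name[-1] == "_"`. A small demonstration through the public API:

import paddle

x = paddle.to_tensor([0.5, -0.5])
y = paddle.tanh(x)   # out-of-place: x is untouched
x.tanh_()            # in-place: eager path resolves to _C_ops.tanh_(x)
assert bool(paddle.allclose(x, y))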
""" if in_dygraph_mode(): - return _C_ops.final_state_tanh_( x ) - return _C_ops.tanh_(x) + return _C_ops.tanh_( x ) + return _legacy_C_ops.tanh_(x) def increment(x, value=1.0, name=None): @@ -3522,10 +3514,10 @@ def increment(x, value=1.0, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_increment_( x, value) + return _C_ops.increment_(x, value) if _in_legacy_dygraph(): - return _C_ops.increment(x, 'step', value) + return _legacy_C_ops.increment(x, 'step', value) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], 'increment') @@ -3601,11 +3593,11 @@ def all(x, axis=None, keepdim=False, name=None): if in_dygraph_mode(): if reduce_all_flag: axis = range(len(x.shape)) - return _C_ops.final_state_all(x, axis, keepdim) + return _C_ops.all(x, axis, keepdim) if _in_legacy_dygraph(): axis = axis if axis != None and axis != [] else [0] - return _C_ops.reduce_all(x, 'dim', axis, 'keep_dim', keepdim, + return _legacy_C_ops.reduce_all(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag) attrs = { @@ -3692,11 +3684,11 @@ def any(x, axis=None, keepdim=False, name=None): if in_dygraph_mode(): if reduce_all_flag: axis = range(len(x.shape)) - return _C_ops.final_state_any(x, axis, keepdim) + return _C_ops.any(x, axis, keepdim) if _in_legacy_dygraph(): axis = axis if axis != None and axis != [] else [0] - return _C_ops.reduce_any(x, 'dim', axis, 'keep_dim', keepdim, + return _legacy_C_ops.reduce_any(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag) attrs = { @@ -3775,10 +3767,10 @@ def conj(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_conj(x) + return _C_ops.conj(x) if paddle.in_dynamic_mode(): - return _C_ops.conj(x) + return _legacy_C_ops.conj(x) check_variable_and_dtype(x, "x", ['complex64', 'complex128', 'float32', 'float64', 'int32', 'int64'], 'conj') @@ -3816,10 +3808,10 @@ def digamma(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_digamma(x) + return _C_ops.digamma(x) else: if _in_legacy_dygraph(): - return _C_ops.digamma(x) + return _legacy_C_ops.digamma(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'digamma') helper = LayerHelper('digamma', **locals()) @@ -3853,9 +3845,9 @@ def lgamma(x, name=None): # [1.31452441, 1.76149750, 2.25271273, 1.09579802] """ if in_dygraph_mode(): - return _C_ops.final_state_lgamma(x) - elif _in_legacy_dygraph(): return _C_ops.lgamma(x) + elif _in_legacy_dygraph(): + return _legacy_C_ops.lgamma(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'lgamma') helper = LayerHelper('lgamma', **locals()) @@ -3932,10 +3924,10 @@ def atan2(x, y, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_atan2( x, y) + return _C_ops.atan2( x, y) else: if _in_legacy_dygraph(): - return _C_ops.atan2(x, y) + return _legacy_C_ops.atan2(x, y) else: check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2') check_variable_and_dtype(y, 'y', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2') @@ -3991,9 +3983,9 @@ def logit(x, eps=None, name=None): if eps == None: eps = 0.0 if _in_legacy_dygraph(): - return _C_ops.logit(x, 'eps', eps) + return _legacy_C_ops.logit(x, 'eps', eps) if in_dygraph_mode(): - return _C_ops.final_state_logit(x, eps) + return _C_ops.logit(x, eps) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'logit') helper = LayerHelper("logit", **locals()) out = helper.create_variable_for_type_inference(x.dtype) @@ -4039,11 +4031,11 @@ def lerp(x, y, weight, 
name=None): if isinstance(weight, float): weight = paddle.to_tensor(weight, dtype=x.dtype) - return _C_ops.final_state_lerp( x, y, weight) + return _C_ops.lerp( x, y, weight) if _in_legacy_dygraph(): if isinstance(weight, float): weight = paddle.to_tensor(weight, dtype=x.dtype) - return _C_ops.lerp(x, y, weight) + return _legacy_C_ops.lerp(x, y, weight) if isinstance(weight, float): weight = paddle.full(shape=[1], fill_value=weight, dtype=x.dtype) @@ -4073,8 +4065,8 @@ def lerp_(x, y, weight, name=None): if out_shape != x.shape: raise ValueError("The shape of broadcast output {} is different from that of inplace tensor {} in the Inplace operation.".format(out_shape, x.shape)) if in_dygraph_mode(): - return _C_ops.final_state_lerp_( x, y, weight) - return _C_ops.lerp_(x, y, weight) + return _C_ops.lerp_( x, y, weight) + return _legacy_C_ops.lerp_(x, y, weight) def erfinv(x, name=None): r""" @@ -4103,12 +4095,12 @@ def erfinv(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_erfinv( x ) + return _C_ops.erfinv( x ) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'erfinv') if paddle.in_dynamic_mode(): - return _C_ops.erfinv(x) + return _legacy_C_ops.erfinv(x) helper = LayerHelper('erfinv', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -4123,8 +4115,8 @@ def erfinv_(x, name=None): """ check_type(x, 'x', (paddle.Tensor, Variable), 'erfinv') if in_dygraph_mode(): - return _C_ops.final_state_erfinv_( x ) - return _C_ops.erfinv_(x) + return _C_ops.erfinv_( x ) + return _legacy_C_ops.erfinv_(x) def rad2deg(x, name=None): r""" @@ -4171,11 +4163,11 @@ def rad2deg(x, name=None): if in_dygraph_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") - return _C_ops.final_state_scale(x, rad2deg_scale, 0.0, True) + return _C_ops.scale(x, rad2deg_scale, 0.0, True) elif paddle.in_dynamic_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") - return _C_ops.scale(x, 'scale', rad2deg_scale) + return _legacy_C_ops.scale(x, 'scale', rad2deg_scale) else: check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float32', 'float64'], 'rad2deg') helper = LayerHelper('rad2deg', **locals()) @@ -4228,11 +4220,11 @@ def deg2rad(x, name=None): if in_dygraph_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") - return _C_ops.final_state_scale(x, deg2rad_scale, 0.0, True) + return _C_ops.scale(x, deg2rad_scale, 0.0, True) elif paddle.in_dynamic_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") - return _C_ops.scale(x, 'scale', deg2rad_scale) + return _legacy_C_ops.scale(x, 'scale', deg2rad_scale) else: check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float32', 'float64'], 'deg2rad') helper = LayerHelper('deg2rad', **locals()) @@ -4459,7 +4451,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): input_list = [x, append] has_pend = True if has_pend: - new_input = _C_ops.final_state_concat(input_list, axis) + new_input = _C_ops.concat(input_list, axis) else: new_input = x @@ -4472,19 +4464,19 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): attrs_1 += ('starts', starts_1) ends_1 = [dim_len - 1] attrs_1 += ('ends', ends_1) - input_front = _C_ops.final_state_slice(new_input, axes, starts_1, ends_1, infer_flags, + input_front = _C_ops.slice(new_input, axes, starts_1, ends_1, infer_flags, []) starts_2 = [1] attrs_2 += ('starts', starts_2) ends_2 = [dim_len] attrs_2 += ('ends', ends_2) - input_back = 
_C_ops.final_state_slice(new_input, axes, starts_2, ends_2, infer_flags, + input_back = _C_ops.slice(new_input, axes, starts_2, ends_2, infer_flags, []) if x.dtype == paddle.bool: - return _C_ops.final_state_logical_xor(input_back, input_front) + return _C_ops.logical_xor(input_back, input_front) else: - return _C_ops.final_state_subtract(input_back, input_front) + return _C_ops.subtract(input_back, input_front) elif _in_legacy_dygraph(): has_pend = False input_list = [] @@ -4499,7 +4491,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): has_pend = True if has_pend: new_input = _varbase_creator() - _C_ops.concat(input_list, new_input, 'axis', axis) + _legacy_C_ops.concat(input_list, new_input, 'axis', axis) else: new_input = x @@ -4512,17 +4504,17 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): attrs_1 += ('starts', starts_1) ends_1 = [dim_len - 1] attrs_1 += ('ends', ends_1) - input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ + input_front = _legacy_C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ 'infer_flags', infer_flags, *attrs_1) starts_2 = [1] attrs_2 += ('starts', starts_2) ends_2 = [dim_len] attrs_2 += ('ends', ends_2) - input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ + input_back = _legacy_C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ 'infer_flags', infer_flags, *attrs_2) if x.dtype == paddle.bool: - return _C_ops.logical_xor(input_back, input_front) + return _legacy_C_ops.logical_xor(input_back, input_front) else: return elementwise_sub(input_back, input_front, axis=axis) else: @@ -4619,9 +4611,9 @@ def angle(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_angle(x) - elif paddle.in_dynamic_mode(): return _C_ops.angle(x) + elif paddle.in_dynamic_mode(): + return _legacy_C_ops.angle(x) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'angle') @@ -4717,11 +4709,11 @@ def frac(x, name=None): raise TypeError( "The data type of input must be one of ['int32', 'int64', 'float32', 'float64'], but got {}".format(x.dtype)) if in_dygraph_mode(): - y = _C_ops.final_state_trunc(x) - return _C_ops.final_state_subtract(x, y) + y = _C_ops.trunc(x) + return _C_ops.subtract(x, y) else: if _in_legacy_dygraph(): - y = _C_ops.trunc(x) + y = _legacy_C_ops.trunc(x) return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) else: diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index f9595f4218570..4c3f7c55c494b 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -20,7 +20,7 @@ from ..static import Variable from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..fluid.framework import in_dygraph_mode -from .. import _C_ops +from .. 
import _C_ops, _legacy_C_ops __deprecated_func_name__ = { 'tanh_shrink': 'tanhshrink', @@ -513,7 +513,7 @@ def erf(x, name=None): if in_dygraph_mode(): - return _C_ops.final_state_erf(x) + return _C_ops.erf(x) locals_var = locals().copy() kwargs = dict() diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 709f61d5ca5d1..25c825cda34fd 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -20,7 +20,7 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, check_shape from ..fluid.layers import utils import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.static import Variable from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph, _current_expected_place @@ -66,10 +66,10 @@ def bernoulli(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_bernoulli(x) + return _C_ops.bernoulli(x) if _in_legacy_dygraph(): - return _C_ops.bernoulli(x) + return _legacy_C_ops.bernoulli(x) check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli") @@ -115,10 +115,10 @@ def poisson(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_poisson(x) + return _C_ops.poisson(x) if paddle.in_dynamic_mode(): - return _C_ops.poisson(x) + return _legacy_C_ops.poisson(x) check_variable_and_dtype(x, "x", ["float32", "float64"], "poisson") @@ -183,11 +183,11 @@ def multinomial(x, num_samples=1, replacement=False, name=None): "multinomial op is not supported on ROCM yet.") if in_dygraph_mode(): - return _C_ops.final_state_multinomial(x, num_samples, replacement) + return _C_ops.multinomial(x, num_samples, replacement) if _in_legacy_dygraph(): - return _C_ops.multinomial(x, 'num_samples', num_samples, 'replacement', - replacement) + return _legacy_C_ops.multinomial(x, 'num_samples', num_samples, + 'replacement', replacement) check_variable_and_dtype(x, "x", ["float32", "float64"], "multinomial") @@ -245,15 +245,15 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) place = _current_expected_place() - return _C_ops.final_state_gaussian_random(shape, float(mean), - float(std), seed, dtype, - place) + return _C_ops.gaussian_random(shape, float(mean), float(std), seed, + dtype, place) if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _C_ops.gaussian_random('shape', - shape, 'mean', float(mean), 'std', - float(std), 'seed', seed, 'dtype', dtype) + return _legacy_C_ops.gaussian_random('shape', shape, + 'mean', float(mean), 'std', + float(std), 'seed', seed, 'dtype', + dtype) check_shape(shape, op_type_for_check) check_dtype(dtype, 'dtype', ['float32', 'float64'], op_type_for_check) @@ -557,14 +557,15 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - return _C_ops.final_state_uniform_random(shape, dtype, float(min), - float(max), seed, - _current_expected_place()) + return _C_ops.uniform_random(shape, dtype, float(min), float(max), seed, + _current_expected_place()) if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _C_ops.uniform_random('shape', shape, 'min', float(min), 'max', - float(max), 'seed', seed, 'dtype', dtype) + return _legacy_C_ops.uniform_random('shape', + shape, 'min', float(min), 'max', + float(max), 'seed', seed, 'dtype', + dtype) check_type(shape, 'shape', (list, tuple, Variable), 'uniform/rand') 
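The random ops show the same split with larger attribute sets: final-state `gaussian_random` takes `(shape, mean, std, seed, dtype, place)` positionally and `uniform_random` takes `(shape, dtype, min, max, seed, place)`, where the legacy kernels spell every attribute as a ('name', value) pair. Callers see no difference; a sketch (paddle.normal ultimately routes through the gaussian helper patched above):

import paddle

paddle.seed(2022)
# Eager path: _C_ops.gaussian_random([2, 3], 0.0, 1.0, seed, dtype, place).
g = paddle.normal(mean=0.0, std=1.0, shape=[2, 3])
# Eager path: _C_ops.uniform_random([2, 3], dtype, -1.0, 1.0, seed, place).
u = paddle.uniform(shape=[2, 3], dtype='float32', min=-1.0, max=1.0)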
check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform/rand') @@ -624,11 +625,10 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): # [ 0.433519, 0.39483607, -0.8660099, 0.83664286]] # random """ if in_dygraph_mode(): - return _C_ops.final_state_uniform_random_inplace_( - x, min, max, seed, 0, 0, 1.0) + return _C_ops.uniform_random_inplace_(x, min, max, seed, 0, 0, 1.0) else: - return _C_ops.uniform_random_inplace_(x, 'min', min, 'max', max, 'seed', - seed) + return _legacy_C_ops.uniform_random_inplace_(x, 'min', min, 'max', max, + 'seed', seed) def randint(low=0, high=None, shape=[1], dtype=None, name=None): @@ -711,11 +711,11 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) place = _current_expected_place() - return _C_ops.final_state_randint(low, high, shape, dtype, place) + return _C_ops.randint(low, high, shape, dtype, place) if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _C_ops.randint('shape', shape, 'low', low, 'high', high, 'seed', - 0, 'dtype', dtype) + return _legacy_C_ops.randint('shape', shape, 'low', low, 'high', high, + 'seed', 0, 'dtype', dtype) check_shape(shape, 'randint') check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint') @@ -885,8 +885,9 @@ def randint_like(x, low=0, high=None, dtype=None, name=None): if paddle.in_dynamic_mode(): shape = utils.convert_shape_to_list(shape) - out = _C_ops.randint('shape', shape, 'low', low, 'high', high, 'seed', - 0, 'dtype', core.VarDesc.VarType.INT64) + out = _legacy_C_ops.randint('shape', shape, 'low', low, 'high', high, + 'seed', 0, 'dtype', + core.VarDesc.VarType.INT64) out = paddle.cast(out, dtype) return out @@ -949,9 +950,9 @@ def randperm(n, dtype="int64", name=None): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - return _C_ops.final_state_randperm(n, dtype, _current_expected_place()) + return _C_ops.randperm(n, dtype, _current_expected_place()) if _in_legacy_dygraph(): - return _C_ops.randperm('n', n, 'seed', 0, 'dtype', dtype) + return _legacy_C_ops.randperm('n', n, 'seed', 0, 'dtype', dtype) if n < 1: raise ValueError("The input n should be greater than 0 in randperm op.") @@ -1056,9 +1057,9 @@ def exponential_(x, lam=1.0, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_exponential_(x, lam) + return _C_ops.exponential_(x, lam) elif paddle.in_dynamic_mode(): - return _C_ops.exponential_(x, "lambda", lam) + return _legacy_C_ops.exponential_(x, "lambda", lam) check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 0324766d3ec43..b740a100358ca 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -22,7 +22,7 @@ from paddle.common_ops_import import convert_np_dtype_to_dtype_ from paddle.common_ops_import import Variable from paddle.common_ops_import import VarDesc -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from .logic import logical_not # TODO: define searching & indexing functions of a tensor @@ -93,11 +93,12 @@ def argsort(x, axis=-1, descending=False, name=None): # [0 2 1 1]]] """ if in_dygraph_mode(): - _, ids = _C_ops.final_state_argsort(x, axis, descending) + _, ids = _C_ops.argsort(x, axis, descending) return ids if _in_legacy_dygraph(): - _, ids = _C_ops.argsort(x, 'axis', axis, 'descending', descending) + _, ids = _legacy_C_ops.argsort(x, 'axis', axis, 'descending', + descending) return ids 
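After the rename, `argsort` and `sort` (patched just below) share the single final-state `argsort` kernel and simply discard the half of the output pair they do not expose. For example:

import paddle

x = paddle.to_tensor([3.0, 1.0, 2.0])
ids = paddle.argsort(x)   # eager: _, ids = _C_ops.argsort(x, -1, False)
vals = paddle.sort(x)     # eager: outs, _ = _C_ops.argsort(x, -1, False)
print(ids)   # Tensor([1, 2, 0])
print(vals)  # Tensor([1., 2., 3.])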
check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], @@ -178,10 +179,10 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): axis = 0 if in_dygraph_mode(): - return _C_ops.final_state_argmax(x, axis, keepdim, flatten, var_dtype) + return _C_ops.argmax(x, axis, keepdim, flatten, var_dtype) if _in_legacy_dygraph(): - out = _C_ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', - keepdim, 'flatten', flatten) + out = _legacy_C_ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, + 'keepdims', keepdim, 'flatten', flatten) return out helper = LayerHelper("argmax", **locals()) @@ -260,10 +261,10 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): axis = 0 if in_dygraph_mode(): - return _C_ops.final_state_argmin(x, axis, keepdim, flatten, var_dtype) + return _C_ops.argmin(x, axis, keepdim, flatten, var_dtype) if _in_legacy_dygraph(): - out = _C_ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', - keepdim, 'flatten', flatten) + out = _legacy_C_ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, + 'keepdims', keepdim, 'flatten', flatten) return out helper = LayerHelper("argmin", **locals()) @@ -322,10 +323,10 @@ def index_select(x, index, axis=0, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_index_select(x, index, axis) + return _C_ops.index_select(x, index, axis) if _in_legacy_dygraph(): - return _C_ops.index_select(x, index, 'dim', axis) + return _legacy_C_ops.index_select(x, index, 'dim', axis) helper = LayerHelper("index_select", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], @@ -402,9 +403,9 @@ def nonzero(x, as_tuple=False): rank = len(shape) if in_dygraph_mode(): - outs = _C_ops.final_state_where_index(x) - elif paddle.in_dynamic_mode(): outs = _C_ops.where_index(x) + elif paddle.in_dynamic_mode(): + outs = _legacy_C_ops.where_index(x) else: helper = LayerHelper("where_index", **locals()) @@ -483,11 +484,12 @@ def sort(x, axis=-1, descending=False, name=None): # [5. 7. 7. 
9.]]] """ if in_dygraph_mode(): - outs, _ = _C_ops.final_state_argsort(x, axis, descending) + outs, _ = _C_ops.argsort(x, axis, descending) return outs if _in_legacy_dygraph(): - outs, _ = _C_ops.argsort(x, 'axis', axis, 'descending', descending) + outs, _ = _legacy_C_ops.argsort(x, 'axis', axis, 'descending', + descending) return outs helper = LayerHelper("sort", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype, @@ -539,9 +541,9 @@ def mode(x, axis=-1, keepdim=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_mode(x, axis, keepdim) + return _C_ops.mode(x, axis, keepdim) if _in_legacy_dygraph(): - return _C_ops.mode(x, "axis", axis, "keepdim", keepdim) + return _legacy_C_ops.mode(x, "axis", axis, "keepdim", keepdim) helper = LayerHelper("mode", **locals()) inputs = {"X": [x]} @@ -658,11 +660,11 @@ def where(condition, x=None, y=None, name=None): broadcast_condition = paddle.cast(broadcast_condition, 'bool') if in_dygraph_mode(): - return _C_ops.final_state_where(broadcast_condition, broadcast_x, - broadcast_y) + return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y) else: if _in_legacy_dygraph(): - return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y) + return _legacy_C_ops.where(broadcast_condition, broadcast_x, + broadcast_y) else: helper = LayerHelper("where", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -752,10 +754,10 @@ def index_sample(x, index): """ if in_dygraph_mode(): - return _C_ops.final_state_index_sample(x, index) + return _C_ops.index_sample(x, index) else: if _in_legacy_dygraph(): - return _C_ops.index_sample(x, index) + return _legacy_C_ops.index_sample(x, index) else: helper = LayerHelper("index_sample", **locals()) check_variable_and_dtype(x, 'x', @@ -804,10 +806,10 @@ def masked_select(x, mask, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_masked_select(x, mask) + return _C_ops.masked_select(x, mask) if _in_legacy_dygraph(): - return _C_ops.masked_select(x, mask) + return _legacy_C_ops.masked_select(x, mask) helper = LayerHelper("masked_select", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], @@ -875,16 +877,17 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): if in_dygraph_mode(): if axis == None: axis = -1 - out, indices = _C_ops.final_state_top_k(x, k, axis, largest, sorted) + out, indices = _C_ops.top_k(x, k, axis, largest, sorted) return out, indices if _non_static_mode(): if axis is None: - out, indices = _C_ops.top_k_v2(x, 'k', int(k), 'largest', largest, - 'sorted', sorted) + out, indices = _legacy_C_ops.top_k_v2(x, 'k', int(k), 'largest', + largest, 'sorted', sorted) else: - out, indices = _C_ops.top_k_v2(x, 'k', int(k), 'axis', axis, - 'largest', largest, 'sorted', sorted) + out, indices = _legacy_C_ops.top_k_v2(x, 'k', int(k), 'axis', axis, + 'largest', largest, 'sorted', + sorted) return out, indices helper = LayerHelper("top_k_v2", **locals()) @@ -1015,12 +1018,11 @@ def searchsorted(sorted_sequence, """ if in_dygraph_mode(): - return _C_ops.final_state_searchsorted(sorted_sequence, values, - out_int32, right) + return _C_ops.searchsorted(sorted_sequence, values, out_int32, right) if _in_legacy_dygraph(): - return _C_ops.searchsorted(sorted_sequence, values, "out_int32", - out_int32, "right", right) + return _legacy_C_ops.searchsorted(sorted_sequence, values, "out_int32", + out_int32, "right", right) check_variable_and_dtype(sorted_sequence, 'SortedSequence', ['float32', 
'float64', 'int32', 'int64'], @@ -1087,13 +1089,13 @@ def kthvalue(x, k, axis=None, keepdim=False, name=None): if _non_static_mode(): if axis is not None: if _in_legacy_dygraph(): - return _C_ops.kthvalue(x, 'k', k, "axis", axis, "keepdim", - keepdim) - return _C_ops.final_state_kthvalue(x, k, axis, keepdim) + return _legacy_C_ops.kthvalue(x, 'k', k, "axis", axis, + "keepdim", keepdim) + return _C_ops.kthvalue(x, k, axis, keepdim) else: if _in_legacy_dygraph(): - return _C_ops.kthvalue(x, 'k', k, "keepdim", keepdim) - return _C_ops.final_state_kthvalue(x, k, -1, keepdim) + return _legacy_C_ops.kthvalue(x, 'k', k, "keepdim", keepdim) + return _C_ops.kthvalue(x, k, -1, keepdim) helper = LayerHelper("kthvalue", **locals()) inputs = {"X": [x]} diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 4a45334f6fe8f..043449cd6d81d 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -22,7 +22,7 @@ from .search import where from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype import paddle -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [] @@ -91,10 +91,10 @@ def mean(x, axis=None, keepdim=False, name=None): if in_dygraph_mode(): if reduce_all: axis = range(len(x.shape)) - return _C_ops.final_state_mean(x, axis, keepdim) + return _C_ops.mean(x, axis, keepdim) if _in_legacy_dygraph(): - return _C_ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim, - 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim, + 'reduce_all', reduce_all) check_variable_and_dtype(x, 'x/input', ['uint16', 'float16', 'float32', 'float64'], @@ -234,9 +234,9 @@ def numel(x, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_size(x) - elif _in_legacy_dygraph(): return _C_ops.size(x) + elif _in_legacy_dygraph(): + return _legacy_C_ops.size(x) if not isinstance(x, Variable): raise TypeError("x must be a Tensor in numel") @@ -322,8 +322,8 @@ def nanmedian(x, axis=None, keepdim=True, name=None): raise ValueError("Axis has duplicated elements.") if _in_legacy_dygraph(): - median_index, out = _C_ops.nanmedian(x, 'axis', axis, 'keepdim', - keepdim) + median_index, out = _legacy_C_ops.nanmedian(x, 'axis', axis, 'keepdim', + keepdim) return out check_variable_and_dtype( diff --git a/python/paddle/text/viterbi_decode.py b/python/paddle/text/viterbi_decode.py index cf6bdd04c2692..a3c81b9c8e628 100644 --- a/python/paddle/text/viterbi_decode.py +++ b/python/paddle/text/viterbi_decode.py @@ -16,7 +16,7 @@ from ..fluid.framework import core, _non_static_mode, in_dygraph_mode from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = ['viterbi_decode', 'ViterbiDecoder'] @@ -59,12 +59,13 @@ def viterbi_decode(potentials, scores, path = paddle.text.viterbi_decode(emission, transition, length, False) # scores: [3.37089300, 1.56825531], path: [[1, 0, 0], [1, 1, 0]] """ if in_dygraph_mode(): - return _C_ops.final_state_viterbi_decode(potentials, transition_params, - lengths, include_bos_eos_tag) + return _C_ops.viterbi_decode(potentials, transition_params, lengths, + include_bos_eos_tag) if _non_static_mode(): - return _C_ops.viterbi_decode(potentials, transition_params, lengths, - 'include_bos_eos_tag', include_bos_eos_tag) + return _legacy_C_ops.viterbi_decode(potentials, transition_params, + lengths, 'include_bos_eos_tag', + 
include_bos_eos_tag) check_variable_and_dtype(potentials, 'input', ['float32', 'float64'], 'viterbi_decode') check_variable_and_dtype(transition_params, 'transitions', diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index acd896e71e89d..032fe4bd35607 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -21,7 +21,7 @@ from ..fluid.initializer import Normal from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from paddle.common_ops_import import * -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ #noqa 'yolo_loss', 'yolo_box', 'deform_conv2d', 'DeformConv2D', @@ -187,13 +187,14 @@ def yolo_loss(x, """ if in_dygraph_mode(): - loss, _, _ = _C_ops.final_state_yolov3_loss( - x, gt_box, gt_label, gt_score, anchors, anchor_mask, class_num, - ignore_thresh, downsample_ratio, use_label_smooth, scale_x_y) + loss, _, _ = _C_ops.yolov3_loss(x, gt_box, gt_label, gt_score, anchors, + anchor_mask, class_num, ignore_thresh, + downsample_ratio, use_label_smooth, + scale_x_y) return loss if _non_static_mode(): - loss, _, _ = _C_ops.yolov3_loss( + loss, _, _ = _legacy_C_ops.yolov3_loss( x, gt_box, gt_label, gt_score, 'anchors', anchors, 'anchor_mask', anchor_mask, 'class_num', class_num, 'ignore_thresh', ignore_thresh, 'downsample_ratio', downsample_ratio, 'use_label_smooth', @@ -375,15 +376,14 @@ def yolo_box(x, scale_x_y=1.) """ if in_dygraph_mode(): - boxes, scores = _C_ops.final_state_yolo_box(x, img_size, anchors, - class_num, conf_thresh, - downsample_ratio, clip_bbox, - scale_x_y, iou_aware, - iou_aware_factor) + boxes, scores = _C_ops.yolo_box(x, img_size, anchors, class_num, + conf_thresh, downsample_ratio, + clip_bbox, scale_x_y, iou_aware, + iou_aware_factor) return boxes, scores if _non_static_mode(): - boxes, scores = _C_ops.yolo_box( + boxes, scores = _legacy_C_ops.yolo_box( x, img_size, 'anchors', anchors, 'class_num', class_num, 'conf_thresh', conf_thresh, 'downsample_ratio', downsample_ratio, 'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y, 'iou_aware', @@ -556,10 +556,9 @@ def deform_conv2d(x, use_deform_conv2d_v1 = True if mask is None else False if in_dygraph_mode(): - pre_bias = _C_ops.final_state_deformable_conv(x, offset, weight, mask, - stride, padding, dilation, - deformable_groups, groups, - 1) + pre_bias = _C_ops.deformable_conv(x, offset, weight, mask, stride, + padding, dilation, deformable_groups, + groups, 1) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=1) else: @@ -570,10 +569,12 @@ def deform_conv2d(x, 'im2col_step', 1) if use_deform_conv2d_v1: op_type = 'deformable_conv_v1' - pre_bias = getattr(_C_ops, op_type)(x, offset, weight, *attrs) + pre_bias = getattr(_legacy_C_ops, op_type)(x, offset, weight, + *attrs) else: op_type = 'deformable_conv' - pre_bias = getattr(_C_ops, op_type)(x, offset, mask, weight, *attrs) + pre_bias = getattr(_legacy_C_ops, op_type)(x, offset, mask, weight, + *attrs) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=1) else: @@ -901,7 +902,7 @@ def distribute_fpn_proposals(fpn_rois, if in_dygraph_mode(): assert rois_num is not None, "rois_num should not be None in dygraph mode." 
- multi_rois, rois_num_per_level, restore_ind = _C_ops.final_state_distribute_fpn_proposals( + multi_rois, rois_num_per_level, restore_ind = _C_ops.distribute_fpn_proposals( fpn_rois, rois_num, min_level, max_level, refer_level, refer_scale, pixel_offset) return multi_rois, restore_ind, rois_num_per_level @@ -911,7 +912,7 @@ def distribute_fpn_proposals(fpn_rois, attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', refer_level, 'refer_scale', refer_scale, 'pixel_offset', pixel_offset) - multi_rois, restore_ind, rois_num_per_level = _C_ops.distribute_fpn_proposals( + multi_rois, restore_ind, rois_num_per_level = _legacy_C_ops.distribute_fpn_proposals( fpn_rois, rois_num, num_lvl, num_lvl, *attrs) return multi_rois, restore_ind, rois_num_per_level @@ -987,7 +988,7 @@ def read_file(filename, name=None): """ if _non_static_mode(): - return _C_ops.read_file('filename', filename) + return _legacy_C_ops.read_file('filename', filename) inputs = dict() attrs = {'filename': filename} @@ -1038,7 +1039,7 @@ def decode_jpeg(x, mode='unchanged', name=None): print(img.shape) """ if _non_static_mode(): - return _C_ops.decode_jpeg(x, "mode", mode) + return _legacy_C_ops.decode_jpeg(x, "mode", mode) inputs = {'X': x} attrs = {"mode": mode} @@ -1100,14 +1101,14 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): "Input features with shape should be (N, C, H, W)" output_channels = int(x.shape[1] / (pooled_height * pooled_width)) if in_dygraph_mode(): - return _C_ops.final_state_psroi_pool(x, boxes, boxes_num, pooled_height, - pooled_width, output_channels, - spatial_scale) + return _C_ops.psroi_pool(x, boxes, boxes_num, pooled_height, + pooled_width, output_channels, spatial_scale) if _in_legacy_dygraph(): - return _C_ops.psroi_pool(x, boxes, boxes_num, "output_channels", - output_channels, "spatial_scale", - spatial_scale, "pooled_height", pooled_height, - "pooled_width", pooled_width) + return _legacy_C_ops.psroi_pool(x, boxes, boxes_num, "output_channels", + output_channels, "spatial_scale", + spatial_scale, "pooled_height", + pooled_height, "pooled_width", + pooled_width) helper = LayerHelper('psroi_pool', **locals()) dtype = helper.input_dtype() @@ -1216,14 +1217,13 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): pooled_height, pooled_width = output_size if in_dygraph_mode(): assert boxes_num is not None, "boxes_num should not be None in dygraph mode." - return _C_ops.final_state_roi_pool(x, boxes, boxes_num, pooled_height, - pooled_width, spatial_scale) + return _C_ops.roi_pool(x, boxes, boxes_num, pooled_height, pooled_width, + spatial_scale) if _in_legacy_dygraph(): assert boxes_num is not None, "boxes_num should not be None in dygraph mode." - pool_out, argmaxes = _C_ops.roi_pool(x, boxes, boxes_num, - "pooled_height", pooled_height, - "pooled_width", pooled_width, - "spatial_scale", spatial_scale) + pool_out, argmaxes = _legacy_C_ops.roi_pool( + x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", + pooled_width, "spatial_scale", spatial_scale) return pool_out else: @@ -1376,16 +1376,17 @@ def roi_align(x, pooled_height, pooled_width = output_size if in_dygraph_mode(): assert boxes_num is not None, "boxes_num should not be None in dygraph mode." 
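The vision ops carry the heaviest attribute lists; `roi_pool`, for instance, trades four ('name', value) pairs for four positional arguments. A toy call through the public wrapper, assuming this patch (shapes and box coordinates are illustrative only):

import paddle

x = paddle.rand([1, 256, 32, 32])
boxes = paddle.to_tensor([[1.0, 1.0, 8.0, 8.0]])
boxes_num = paddle.to_tensor([1], dtype='int32')
# Eager path: _C_ops.roi_pool(x, boxes, boxes_num, 7, 7, 1.0).
out = paddle.vision.ops.roi_pool(x, boxes, boxes_num, output_size=7)
print(out.shape)  # [1, 256, 7, 7]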
- return _C_ops.final_state_roi_align(x, boxes, boxes_num, pooled_height, - pooled_width, spatial_scale, - sampling_ratio, aligned) + return _C_ops.roi_align(x, boxes, boxes_num, pooled_height, + pooled_width, spatial_scale, sampling_ratio, + aligned) if _in_legacy_dygraph(): assert boxes_num is not None, "boxes_num should not be None in dygraph mode." - align_out = _C_ops.roi_align(x, boxes, boxes_num, "pooled_height", - pooled_height, "pooled_width", - pooled_width, "spatial_scale", - spatial_scale, "sampling_ratio", - sampling_ratio, "aligned", aligned) + align_out = _legacy_C_ops.roi_align(x, boxes, boxes_num, + "pooled_height", pooled_height, + "pooled_width", pooled_width, + "spatial_scale", spatial_scale, + "sampling_ratio", sampling_ratio, + "aligned", aligned) return align_out else: @@ -1591,10 +1592,10 @@ def nms(boxes, def _nms(boxes, iou_threshold): if in_dygraph_mode(): - return _C_ops.final_state_nms(boxes, iou_threshold) + return _C_ops.nms(boxes, iou_threshold) if _non_static_mode(): - return _C_ops.nms(boxes, 'iou_threshold', iou_threshold) + return _legacy_C_ops.nms(boxes, 'iou_threshold', iou_threshold) helper = LayerHelper('nms', **locals()) out = helper.create_variable_for_type_inference('int64') @@ -1750,7 +1751,7 @@ def generate_proposals(scores, assert return_rois_num, "return_rois_num should be True in dygraph mode." attrs = (pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta, pixel_offset) - rpn_rois, rpn_roi_probs, rpn_rois_num = _C_ops.final_state_generate_proposals_v2( + rpn_rois, rpn_roi_probs, rpn_rois_num = _C_ops.generate_proposals_v2( scores, bbox_deltas, img_size, anchors, variances, *attrs) return rpn_rois, rpn_roi_probs, rpn_rois_num @@ -1759,7 +1760,7 @@ def generate_proposals(scores, attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, 'pixel_offset', pixel_offset) - rpn_rois, rpn_roi_probs, rpn_rois_num = _C_ops.generate_proposals_v2( + rpn_rois, rpn_roi_probs, rpn_rois_num = _legacy_C_ops.generate_proposals_v2( scores, bbox_deltas, img_size, anchors, variances, *attrs) return rpn_rois, rpn_roi_probs, rpn_rois_num @@ -1904,10 +1905,11 @@ def matrix_nms(bboxes, check_type(background_label, 'background_label', int, 'matrix_nms') if in_dygraph_mode(): - out, index, rois_num = _C_ops.final_state_matrix_nms( - bboxes, scores, score_threshold, nms_top_k, keep_top_k, - post_threshold, use_gaussian, gaussian_sigma, background_label, - normalized) + out, index, rois_num = _C_ops.matrix_nms(bboxes, scores, + score_threshold, nms_top_k, + keep_top_k, post_threshold, + use_gaussian, gaussian_sigma, + background_label, normalized) if not return_index: index = None if not return_rois_num: @@ -1919,7 +1921,7 @@ def matrix_nms(bboxes, nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', use_gaussian, 'keep_top_k', keep_top_k, 'normalized', normalized) - out, index, rois_num = _C_ops.matrix_nms(bboxes, scores, *attrs) + out, index, rois_num = _legacy_C_ops.matrix_nms(bboxes, scores, *attrs) if not return_index: index = None if not return_rois_num: diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index a73a82f70b7dc..6b0f53ac52369 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -439,7 +439,7 @@ if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then fi fi -SKIP_CI=`git log -2 |grep -w "test=document_fix" || true` +SKIP_CI=`git log --pretty=oneline|grep $AGILE_REVISION |grep -w 
"test=document_fix" || true` if [[ ${SKIP_CI} ]];then echo_line="You must have one RD (tianshuo78520a (Recommend),zhiqiu ,phlrain ) approval you add test=document_fix method in commit skips CI" check_approval 1 tianshuo78520a zhiqiu phlrain diff --git a/tools/count_api_without_core_ops.py b/tools/count_api_without_core_ops.py index 6b5bffd332743..c408b30a13646 100644 --- a/tools/count_api_without_core_ops.py +++ b/tools/count_api_without_core_ops.py @@ -22,7 +22,7 @@ import hashlib import functools import platform -from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops __all__ = [ 'get_apis_with_and_without_core_ops', @@ -210,7 +210,7 @@ def get_api_source_desc(modules): else: print("""Usage: - 1. Count and list all operator-raleated APIs that contains append_op but not _C_ops.xx. + 1. Count and list all operator-raleated APIs that contains append_op but not _legacy_C_ops.xx. python ./count_api_without_core_ops.py -c paddle 2. Print api and the md5 of source code of the api. python ./count_api_without_core_ops.py -p paddle diff --git a/tools/gen_ut_cmakelists.py b/tools/gen_ut_cmakelists.py index c5716dd10b562..b95739fed6104 100644 --- a/tools/gen_ut_cmakelists.py +++ b/tools/gen_ut_cmakelists.py @@ -13,6 +13,10 @@ # limitations under the License. import re +import os +import argparse + +# port range (21200, 23000) is reserved for dist-ops # function to process pythonpath env @@ -67,10 +71,10 @@ def process_conditions(conditions): Output: "" """ if len(conditions.strip()) == 0: - conditions = "" + conditions = [] else: - conditions = f" AND ({conditions})" - return conditions + conditions = conditions.strip().split(";") + return [c.strip() for c in conditions] def proccess_archs(arch): @@ -90,8 +94,8 @@ def proccess_archs(arch): arch = arch.upper().strip() if len(arch) > 0: for a in arch.split(";"): - assert a in ["GPU", "ROCM", "ASCEND", "ASCEND_CL"], \ - f"""Supported arhc options are "GPU", "ROCM", "ASCEND" and "ASCEND_CL", but the options is {a}""" + assert a in ["GPU", "ROCM", "ASCEND", "ASCEND_CL", "XPU"], \ + f"""Supported arhc options are "GPU", "ROCM", "ASCEND" and "ASCEND_CL", "XPU", but the options is {a}""" archs += "WITH_" + a.upper() + " OR " arch = "(" + archs[:-4] + ")" else: @@ -135,6 +139,34 @@ def process_run_serial(run_serial): return rs +def file_with_extension(prefix, suffixes): + """ + Desc: + check whether test file exists. + """ + for ext in suffixes: + if os.path.isfile(prefix + ext): + return True + return False + + +def process_name(name, curdir): + """ + Desc: + check whether name is with a legal format and check whther the test file exists. 
+ """ + name = name.strip() + assert re.compile("^test_[0-9a-zA-Z_]+").search(name), \ + f"""If line is not the header of table, the test name must begin with "test_" """ \ + f"""and the following substring must include at least one char of "0-9", "a-z", "A-Z" or "_".""" + filepath_prefix = os.path.join(curdir, name) + suffix = [".py", ".sh"] + assert file_with_extension(filepath_prefix, suffix), \ + f""" Please ensure the test file with the prefix '{filepath_prefix}' and one of the suffix {suffix} exists, because you specified a unittest named '{name}'""" + + return name + + def process_run_type(run_type): rt = run_type.strip() assert re.compile("^(NIGHTLY|EXCLUSIVE|CINN|DIST|GPUPS|INFER|EXCLUSIVE:NIGHTLY|DIST:NIGHTLY)$").search(rt), \ @@ -143,7 +175,18 @@ def process_run_type(run_type): return rt -def parse_line(line): +DIST_UT_PORT = 21200 + + +def process_dist_ut_port(port_num): + global DIST_UT_PORT + port = DIST_UT_PORT + assert port < 23000, "dist port is exahausted" + DIST_UT_PORT += int(port_num) + return port + + +def parse_line(line, curdir): """ Desc: Input a line in csv file and output a string in cmake grammer, adding the specified test and setting its properties. @@ -161,24 +204,14 @@ def parse_line(line): endif()" """ - # A line contains name, os_, archs, timeout, run_type, launcher, dist_ut_port, run_serial, envs, conditions, etc. - # Following are descriptions of each variable: - # - # * `name`: the test's name - # * `os`: The supported operator system, ignoring case. If the test run in multiple operator systems, use ";" to split systems, forexample, `apple;linux` means the test runs on both Apple and Linux. The supported values are `linux`,`win32` and `apple`. If the value is empty, this means the test runs on all opertaor systems. - # * `arch`: the device's architecture. similar to `os`, multiple valuse ars splited by ";" and ignoring case. The supported arhchetectures are `gpu`, `xpu`, `npu` and `rocm`. - # * `timeout`: timeout of a unittest, whose unit is second. - # * `run_type`: run_type of a unittest. Supported values are `NIGHTLY`, `EXCLUSIVE`, `CINN`, `DIST`, `GPUPS`, `INFER`, `EXCLUSIVE:NIGHTLY`, `DIST:NIGHTLY`,which are case-insensitive. - # * `launcher`: the test launcher.Supported values are test_runner.py, dist_test.sh and custom scripts' name. - # * `dist_ut_port`: the starting port used in a distributed unit test - # * `run_serial`: whether in serial mode. the value can be 1 or 0. Default(empty) is 0 - # * `ENVS`: required environments. multiple envirenmonts are splited by ";". - # * `conditions`: extra required conditions for some tests. the value is a boolean expression in cmake programmer. name, os_, archs, timeout, run_type, launcher, dist_ut_port, run_serial, envs, conditions = line.strip( ).split(",") + # name == "name" means the line being parsed is the header of the table + # we should skip this line and return empty here. 
if name == "name": return "" + name = process_name(name, curdir) envs = process_envs(envs) conditions = process_conditions(conditions) @@ -189,8 +222,13 @@ def parse_line(line): cmd = "" + for c in conditions: + cmd += f"if ({c})\n" + + time_out_str = f'TIMEOUT "{timeout}"' if len(timeout.strip()) > 0 else '' if launcher[-3:] == ".sh": - cmd += f'''if({archs} AND {os_} {conditions}) + dist_ut_port = process_dist_ut_port(2) + cmd += f'''if({archs} AND {os_}) bash_test_modules( {name} START_BASH @@ -199,27 +237,48 @@ def parse_line(line): "RUN_TYPE={run_type}" ENVS "PADDLE_DIST_UT_PORT={dist_ut_port};{envs}") - set_tests_properties({name} PROPERTIES TIMEOUT "{timeout}" RUN_SERIAL {run_serial}) + set_tests_properties({name} PROPERTIES {time_out_str} RUN_SERIAL {run_serial}) endif() ''' else: - cmd += f'''if({archs} AND {os_} {conditions}) + cmd += f'''if({archs} AND {os_}) py_test_modules( {name} MODULES {name} ENVS - "PADDLE_DIST_UT_PORT={dist_ut_port};{envs}") - set_tests_properties({name} PROPERTIES TIMEOUT "{timeout}" RUN_SERIAL {run_serial}) + "{envs}") + set_tests_properties({name} PROPERTIES {time_out_str} RUN_SERIAL {run_serial}) endif() ''' + for _ in conditions: + cmd += f"endif()\n" return cmd +PROCESSED_DIR = set() + + def gen_cmakelists(current_work_dir): print("procfessing dir:", current_work_dir) if current_work_dir == "": current_work_dir = "." + + contents = os.listdir(current_work_dir) + sub_dirs = [] + for c in contents: + c_path = os.path.join(current_work_dir, c) + if c_path in PROCESSED_DIR: + return + if os.path.isdir(c_path): + PROCESSED_DIR.add(c_path) + if os.path.isfile(os.path.join(current_work_dir, c, "testslist.csv")) \ + or os.path.isfile(os.path.join(current_work_dir, c, "CMakeLists.txt")): + gen_cmakelists(os.path.join(current_work_dir, c)) + sub_dirs.append(c) + + if not os.path.isfile(os.path.join(current_work_dir, "testslist.csv")): + return cmds = """# This file is generated by ${PADDLE_ROOT}/tools/gen_ut_cmakelists.py. # Please don't modify this file manually. # If you need to change unittests in this file, please modify testslist.csv in the current directory @@ -229,7 +288,7 @@ def gen_cmakelists(current_work_dir): with open(f"{current_work_dir}/testslist.csv") as csv_file: for i, line in enumerate(csv_file.readlines()): try: - cmds += parse_line(line) + cmds += parse_line(line, current_work_dir) except Exception as e: print("===============PARSE LINE ERRORS OCCUR==========") print(e) @@ -237,14 +296,14 @@ def gen_cmakelists(current_work_dir): print(f"[ERROR LINE {i+1}]: {line.strip()}") exit(1) + for sub in sub_dirs: + cmds += f"add_subdirectory({sub})\n" print(cmds, end="") with open(f"{current_work_dir}/CMakeLists.txt", "w") as cmake_file: print(cmds, end="", file=cmake_file) if __name__ == "__main__": - import os - import argparse parser = argparse.ArgumentParser() parser.add_argument( "--files",